mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts_benchmark_multilingual_visual_sts import (
|
|
4
4
|
STSBenchmarkMultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_stsb
|
|
7
|
+
task_list_stsb = [
|
|
8
8
|
STSBenchmarkMultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["eng"], hf_subsets=["en"]
|
|
10
10
|
)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrievalFa,
|
|
5
5
|
CQADupstackEnglishRetrievalFa,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrievalFa,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrievalFa(),
|
|
20
20
|
CQADupstackEnglishRetrievalFa(),
|
|
21
21
|
CQADupstackGamingRetrievalFa(),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.classification import (
|
|
4
4
|
SynPerChatbotConvSAAnger,
|
|
5
5
|
SynPerChatbotConvSAFear,
|
|
@@ -12,7 +12,7 @@ from mteb.tasks.classification import (
|
|
|
12
12
|
SynPerChatbotConvSASurprise,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
-
task_list_cqa
|
|
15
|
+
task_list_cqa = [
|
|
16
16
|
SynPerChatbotConvSAAnger(),
|
|
17
17
|
SynPerChatbotConvSASatisfaction(),
|
|
18
18
|
SynPerChatbotConvSAFriendship(),
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts17_multilingual_visual_sts import (
|
|
4
4
|
STS17MultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_sts17_multi
|
|
7
|
+
task_list_sts17_multi = [
|
|
8
8
|
STS17MultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["ara", "eng", "spa", "kor"],
|
|
10
10
|
hf_subsets=[
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts_benchmark_multilingual_visual_sts import (
|
|
4
4
|
STSBenchmarkMultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_multi
|
|
7
|
+
task_list_multi = [
|
|
8
8
|
STSBenchmarkMultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=[
|
|
10
10
|
"deu",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidNLRetrieval,
|
|
5
5
|
CQADupstackEnglishNLRetrieval,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressNLRetrieval,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidNLRetrieval(),
|
|
20
20
|
CQADupstackEnglishNLRetrieval(),
|
|
21
21
|
CQADupstackGamingNLRetrieval(),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval.pol.cqadupstack_pl_retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrievalPL,
|
|
5
5
|
CQADupstackEnglishRetrievalPL,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval.pol.cqadupstack_pl_retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrievalPL,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrievalPL(),
|
|
20
20
|
CQADupstackEnglishRetrievalPL(),
|
|
21
21
|
CQADupstackGamingRetrievalPL(),
|
|
@@ -59,7 +59,7 @@ class PubChemSMILESBitextMining(AbsTaskBitextMining):
|
|
|
59
59
|
""",
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
-
def dataset_transform(self):
|
|
62
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
63
63
|
for subset in self.hf_subsets:
|
|
64
64
|
self.dataset[subset] = self.dataset[subset].rename_columns(
|
|
65
65
|
COL_MAPPING[subset]
|
|
@@ -27,7 +27,7 @@ class SAMSumFa(AbsTaskBitextMining):
|
|
|
27
27
|
bibtex_citation="",
|
|
28
28
|
)
|
|
29
29
|
|
|
30
|
-
def dataset_transform(self):
|
|
30
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
31
31
|
self.dataset = self.dataset.rename_columns(
|
|
32
32
|
{"text": "sentence1", "summary": "sentence2"}
|
|
33
33
|
)
|
|
@@ -58,7 +58,7 @@ class SynPerChatbotSumSRetrieval(AbsTaskBitextMining):
|
|
|
58
58
|
bibtex_citation=""" """,
|
|
59
59
|
)
|
|
60
60
|
|
|
61
|
-
def dataset_transform(self):
|
|
61
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
62
62
|
self.dataset = self.dataset.rename_columns(
|
|
63
63
|
{"text": "sentence1", "summary": "sentence2"}
|
|
64
64
|
)
|
|
@@ -89,7 +89,7 @@ class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining):
|
|
|
89
89
|
bibtex_citation=""" """,
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
def dataset_transform(self):
|
|
92
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
93
93
|
self.dataset = self.dataset.rename_columns(
|
|
94
94
|
{"text": "sentence1", "summary": "sentence2"}
|
|
95
95
|
)
|
|
@@ -35,7 +35,7 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining):
|
|
|
35
35
|
prompt="Retrieve parallel sentences in Norwegian Bokmål and Nynorsk",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self):
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
39
39
|
# Convert to standard format
|
|
40
40
|
self.dataset = self.dataset.rename_column("nb", "sentence1")
|
|
41
41
|
self.dataset = self.dataset.rename_column("nn", "sentence2")
|
|
@@ -32,7 +32,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
|
|
|
32
32
|
bibtex_citation="",
|
|
33
33
|
)
|
|
34
34
|
|
|
35
|
-
def load_data(self) -> None:
|
|
35
|
+
def load_data(self, num_proc: int = 1, **kwargs) -> None:
|
|
36
36
|
"""Load dataset from HuggingFace hub and convert it to the standard format."""
|
|
37
37
|
if self.data_loaded:
|
|
38
38
|
return
|
|
@@ -44,7 +44,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining):
|
|
|
44
44
|
self.dataset_transform()
|
|
45
45
|
self.data_loaded = True
|
|
46
46
|
|
|
47
|
-
def dataset_transform(self):
|
|
47
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
48
48
|
for lang in self.hf_subsets:
|
|
49
49
|
self.dataset[lang] = self.dataset[lang].rename_columns(
|
|
50
50
|
{"romani": "sentence1", "hungarian": "sentence2"}
|
|
@@ -230,7 +230,7 @@ class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
|
|
|
230
230
|
""",
|
|
231
231
|
)
|
|
232
232
|
|
|
233
|
-
def dataset_transform(self):
|
|
233
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
234
234
|
dataset = {}
|
|
235
235
|
for langs in self.dataset:
|
|
236
236
|
dataset[langs] = {}
|
|
@@ -284,7 +284,7 @@ class WebFAQBitextMiningQAs(AbsTaskBitextMining):
|
|
|
284
284
|
""",
|
|
285
285
|
)
|
|
286
286
|
|
|
287
|
-
def dataset_transform(self):
|
|
287
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
288
288
|
dataset = {}
|
|
289
289
|
for langs in self.dataset:
|
|
290
290
|
dataset[langs] = {}
|
|
@@ -28,7 +28,7 @@ class OnlineStoreReviewSentimentClassification(AbsTaskClassification):
|
|
|
28
28
|
superseded_by="OnlineStoreReviewSentimentClassification.v2",
|
|
29
29
|
)
|
|
30
30
|
|
|
31
|
-
def dataset_transform(self):
|
|
31
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
32
32
|
self.dataset = self.stratified_subsampling(
|
|
33
33
|
self.dataset, seed=self.seed, splits=["train"]
|
|
34
34
|
)
|
|
@@ -37,7 +37,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
|
|
|
37
37
|
superseded_by="RestaurantReviewSentimentClassification.v2",
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
# labels: 0 negative, 1 positive
|
|
42
42
|
self.dataset = self.dataset.rename_column("polarity", "label")
|
|
43
43
|
self.dataset = self.stratified_subsampling(
|
|
@@ -48,7 +48,7 @@ Mubarak, Hamdy},
|
|
|
48
48
|
superseded_by="TweetSarcasmClassification.v2",
|
|
49
49
|
)
|
|
50
50
|
|
|
51
|
-
def dataset_transform(self):
|
|
51
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
52
52
|
# labels: 0 non-sarcastic, 1 sarcastic
|
|
53
53
|
self.dataset = self.dataset.rename_columns(
|
|
54
54
|
{"tweet": "text", "sarcasm": "label"}
|
|
@@ -36,7 +36,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
|
|
|
36
36
|
superseded_by="BengaliHateSpeechClassification.v2",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
39
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["train"]
|
|
42
42
|
)
|
|
@@ -36,7 +36,7 @@ class BengaliSentimentAnalysis(AbsTaskClassification):
|
|
|
36
36
|
superseded_by="BengaliSentimentAnalysis.v2",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
39
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["train"]
|
|
42
42
|
)
|
|
@@ -37,7 +37,7 @@ class BulgarianStoreReviewSentimentClassfication(AbsTaskClassification):
|
|
|
37
37
|
""",
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
self.dataset = self.dataset.rename_columns(
|
|
42
42
|
{"Review": "text", "Category": "label"}
|
|
43
43
|
)
|
|
@@ -39,7 +39,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
39
39
|
# Increase the samples_per_label in order to improve baseline performance
|
|
40
40
|
samples_per_label = 20
|
|
41
41
|
|
|
42
|
-
def dataset_transform(self):
|
|
42
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
43
43
|
self.dataset = self.dataset.rename_columns(
|
|
44
44
|
{"comment": "text", "rating_int": "label"}
|
|
45
45
|
)
|
|
@@ -85,7 +85,7 @@ class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
|
85
85
|
# Increase the samples_per_label in order to improve baseline performance
|
|
86
86
|
samples_per_label = 20
|
|
87
87
|
|
|
88
|
-
def dataset_transform(self):
|
|
88
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
89
89
|
self.dataset = self.stratified_subsampling(
|
|
90
90
|
self.dataset, seed=self.seed, splits=["test"], n_samples=2048
|
|
91
91
|
)
|
|
@@ -60,9 +60,9 @@ Piperidis, Stelios},
|
|
|
60
60
|
|
|
61
61
|
samples_per_label = 16
|
|
62
62
|
|
|
63
|
-
def dataset_transform(self):
|
|
63
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
64
64
|
# convert label to a 0/1 label
|
|
65
|
-
labels = self.dataset["train"]["label"]
|
|
65
|
+
labels = self.dataset["train"]["label"]
|
|
66
66
|
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
|
|
67
67
|
self.dataset = self.dataset.map(
|
|
68
68
|
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
|
|
@@ -49,7 +49,7 @@ Zesch, Torsten},
|
|
|
49
49
|
superseded_by="GermanPoliticiansTwitterSentimentClassification.v2",
|
|
50
50
|
)
|
|
51
51
|
|
|
52
|
-
def dataset_transform(self):
|
|
52
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
53
53
|
self.dataset = self.dataset.rename_column("majority_sentiment", "label")
|
|
54
54
|
|
|
55
55
|
|
|
@@ -40,7 +40,7 @@ class DBpediaClassification(AbsTaskClassification):
|
|
|
40
40
|
superseded_by="DBpediaClassification.v2",
|
|
41
41
|
)
|
|
42
42
|
|
|
43
|
-
def dataset_transform(self):
|
|
43
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
44
44
|
self.dataset = self.dataset.rename_column("content", "text")
|
|
45
45
|
self.dataset = self.stratified_subsampling(
|
|
46
46
|
self.dataset, seed=self.seed, splits=["train", "test"]
|
|
@@ -85,7 +85,7 @@ class DBpediaClassificationV2(AbsTaskClassification):
|
|
|
85
85
|
adapted_from=["DBpediaClassification"],
|
|
86
86
|
)
|
|
87
87
|
|
|
88
|
-
def dataset_transform(self):
|
|
88
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
89
89
|
self.dataset = self.stratified_subsampling(
|
|
90
90
|
self.dataset, seed=self.seed, splits=["train", "test"]
|
|
91
91
|
)
|
|
@@ -40,7 +40,7 @@ class ToxicChatClassification(AbsTaskClassification):
|
|
|
40
40
|
superseded_by="ToxicChatClassification.v2",
|
|
41
41
|
)
|
|
42
42
|
|
|
43
|
-
def dataset_transform(self):
|
|
43
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
44
44
|
keep_cols = ["user_input", "toxicity"]
|
|
45
45
|
rename_dict = dict(zip(keep_cols, ["text", "label"]))
|
|
46
46
|
remove_cols = [
|
|
@@ -93,7 +93,7 @@ class ToxicChatClassificationV2(AbsTaskClassification):
|
|
|
93
93
|
adapted_from=["ToxicChatClassification"],
|
|
94
94
|
)
|
|
95
95
|
|
|
96
|
-
def dataset_transform(self):
|
|
96
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
97
97
|
self.dataset = self.stratified_subsampling(
|
|
98
98
|
self.dataset, seed=self.seed, splits=["test"]
|
|
99
99
|
)
|
|
@@ -42,7 +42,7 @@ class ToxicConversationsClassification(AbsTaskClassification):
|
|
|
42
42
|
|
|
43
43
|
samples_per_label = 16
|
|
44
44
|
|
|
45
|
-
def dataset_transform(self):
|
|
45
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
46
46
|
self.dataset = self.stratified_subsampling(
|
|
47
47
|
self.dataset, seed=self.seed, splits=["test"]
|
|
48
48
|
)
|
|
@@ -88,7 +88,7 @@ class ToxicConversationsClassificationV2(AbsTaskClassification):
|
|
|
88
88
|
|
|
89
89
|
samples_per_label = 16
|
|
90
90
|
|
|
91
|
-
def dataset_transform(self):
|
|
91
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
92
92
|
self.dataset = self.stratified_subsampling(
|
|
93
93
|
self.dataset, seed=self.seed, splits=["test"]
|
|
94
94
|
)
|
|
@@ -83,7 +83,7 @@ class YahooAnswersTopicsClassificationV2(AbsTaskClassification):
|
|
|
83
83
|
|
|
84
84
|
samples_per_label = 32
|
|
85
85
|
|
|
86
|
-
def dataset_transform(self):
|
|
86
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
87
87
|
self.dataset = self.stratified_subsampling(
|
|
88
88
|
self.dataset, seed=self.seed, splits=["train", "test"]
|
|
89
89
|
)
|
|
@@ -42,7 +42,7 @@ class YelpReviewFullClassification(AbsTaskClassification):
|
|
|
42
42
|
|
|
43
43
|
samples_per_label = 128
|
|
44
44
|
|
|
45
|
-
def dataset_transform(self):
|
|
45
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
46
46
|
self.dataset = self.stratified_subsampling(
|
|
47
47
|
self.dataset, seed=self.seed, splits=["test"]
|
|
48
48
|
)
|
|
@@ -88,7 +88,7 @@ class YelpReviewFullClassificationV2(AbsTaskClassification):
|
|
|
88
88
|
|
|
89
89
|
samples_per_label = 128
|
|
90
90
|
|
|
91
|
-
def dataset_transform(self):
|
|
91
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
92
92
|
self.dataset = self.stratified_subsampling(
|
|
93
93
|
self.dataset, seed=self.seed, splits=["test"]
|
|
94
94
|
)
|
|
@@ -40,12 +40,12 @@ class EstonianValenceClassification(AbsTaskClassification):
|
|
|
40
40
|
superseded_by="EstonianValenceClassification.v2",
|
|
41
41
|
)
|
|
42
42
|
|
|
43
|
-
def dataset_transform(self):
|
|
43
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
44
44
|
self.dataset = self.dataset.rename_column("paragraph", "text").rename_column(
|
|
45
45
|
"valence", "label"
|
|
46
46
|
)
|
|
47
47
|
# convert label to a numbers
|
|
48
|
-
labels = self.dataset["train"]["label"]
|
|
48
|
+
labels = self.dataset["train"]["label"]
|
|
49
49
|
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
|
|
50
50
|
self.dataset = self.dataset.map(
|
|
51
51
|
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
|
|
@@ -602,7 +602,7 @@ class DeepSentiPers(AbsTaskClassification):
|
|
|
602
602
|
)
|
|
603
603
|
samples_per_label = 32
|
|
604
604
|
|
|
605
|
-
def dataset_transform(self):
|
|
605
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
606
606
|
self.dataset = self.dataset.rename_column("review", "text")
|
|
607
607
|
|
|
608
608
|
|
|
@@ -773,7 +773,7 @@ class NLPTwitterAnalysisClassification(AbsTaskClassification):
|
|
|
773
773
|
)
|
|
774
774
|
samples_per_label = 32
|
|
775
775
|
|
|
776
|
-
def dataset_transform(self):
|
|
776
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
777
777
|
self.dataset = self.dataset.rename_column("tweet", "text")
|
|
778
778
|
|
|
779
779
|
|
|
@@ -858,7 +858,7 @@ class FaIntentClassification(AbsTaskClassification):
|
|
|
858
858
|
)
|
|
859
859
|
samples_per_label = 32
|
|
860
860
|
|
|
861
|
-
def dataset_transform(self):
|
|
861
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
862
862
|
self.dataset = self.dataset.rename_column("words", "text")
|
|
863
863
|
self.dataset = self.dataset.rename_column("intent_label", "label")
|
|
864
864
|
|
|
@@ -889,7 +889,7 @@ class StyleClassification(AbsTaskClassification):
|
|
|
889
889
|
)
|
|
890
890
|
samples_per_label = 32
|
|
891
891
|
|
|
892
|
-
def dataset_transform(self):
|
|
892
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
893
893
|
mapping = {"formal": 1, "informal": 0}
|
|
894
894
|
self.dataset = self.dataset.map(
|
|
895
895
|
lambda example: {"label": mapping[example["label"]]}
|
|
@@ -927,7 +927,7 @@ class PerShopDomainClassification(AbsTaskClassification):
|
|
|
927
927
|
)
|
|
928
928
|
samples_per_label = 32
|
|
929
929
|
|
|
930
|
-
def dataset_transform(self):
|
|
930
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
931
931
|
self.dataset = self.dataset.rename_column("domain", "label")
|
|
932
932
|
|
|
933
933
|
|
|
@@ -962,5 +962,5 @@ class PerShopIntentClassification(AbsTaskClassification):
|
|
|
962
962
|
)
|
|
963
963
|
samples_per_label = 32
|
|
964
964
|
|
|
965
|
-
def dataset_transform(self):
|
|
965
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
966
966
|
self.dataset = self.dataset.rename_column("Intents & Actions", "label")
|
|
@@ -37,7 +37,7 @@ class PersianFoodSentimentClassification(AbsTaskClassification):
|
|
|
37
37
|
""",
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
self.dataset = self.stratified_subsampling(
|
|
42
42
|
self.dataset, seed=self.seed, splits=["validation", "test"]
|
|
43
43
|
)
|
|
@@ -36,7 +36,7 @@ class FilipinoShopeeReviewsClassification(AbsTaskClassification):
|
|
|
36
36
|
""",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
39
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["validation", "test"]
|
|
42
42
|
)
|
|
@@ -29,7 +29,7 @@ class FrenchBookReviews(AbsTaskClassification):
|
|
|
29
29
|
superseded_by="FrenchBookReviews.v2",
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
def dataset_transform(self):
|
|
32
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
33
33
|
self.dataset = self.dataset.rename_columns({"reader_review": "text"})
|
|
34
34
|
self.dataset = self.stratified_subsampling(
|
|
35
35
|
self.dataset, seed=self.seed, splits=["train"]
|
|
@@ -63,7 +63,7 @@ class FrenchBookReviewsV2(AbsTaskClassification):
|
|
|
63
63
|
adapted_from=["FrenchBookReviews"],
|
|
64
64
|
)
|
|
65
65
|
|
|
66
|
-
def dataset_transform(self):
|
|
66
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
67
67
|
self.dataset = self.stratified_subsampling(
|
|
68
68
|
self.dataset, seed=self.seed, splits=["train"]
|
|
69
69
|
)
|
|
@@ -35,7 +35,7 @@ class MovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
35
35
|
superseded_by="MovieReviewSentimentClassification.v2",
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
-
def dataset_transform(self):
|
|
38
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
39
39
|
self.dataset = self.dataset.rename_column("review", "text")
|
|
40
40
|
self.dataset = self.stratified_subsampling(
|
|
41
41
|
self.dataset, seed=self.seed, splits=["validation", "test"]
|
|
@@ -75,7 +75,7 @@ class MovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
|
75
75
|
adapted_from=["MovieReviewSentimentClassification"],
|
|
76
76
|
)
|
|
77
77
|
|
|
78
|
-
def dataset_transform(self):
|
|
78
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
79
79
|
self.dataset = self.stratified_subsampling(
|
|
80
80
|
self.dataset, seed=self.seed, splits=["validation", "test"]
|
|
81
81
|
)
|
|
@@ -28,7 +28,7 @@ class GujaratiNewsClassification(AbsTaskClassification):
|
|
|
28
28
|
superseded_by="GujaratiNewsClassification.v2",
|
|
29
29
|
)
|
|
30
30
|
|
|
31
|
-
def dataset_transform(self):
|
|
31
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
32
32
|
self.dataset = self.dataset.rename_column("headline", "text")
|
|
33
33
|
|
|
34
34
|
|
|
@@ -101,7 +101,7 @@ Stent, Amanda},
|
|
|
101
101
|
adapted_from=["HindiDiscourseClassification"],
|
|
102
102
|
)
|
|
103
103
|
|
|
104
|
-
def dataset_transform(self):
|
|
104
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
105
105
|
self.dataset = self.stratified_subsampling(
|
|
106
106
|
self.dataset, seed=self.seed, splits=["train"]
|
|
107
107
|
)
|
|
@@ -37,7 +37,7 @@ class SentimentAnalysisHindi(AbsTaskClassification):
|
|
|
37
37
|
superseded_by="SentimentAnalysisHindi.v2",
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
def dataset_transform(self):
|
|
40
|
+
def dataset_transform(self, num_proc: int = 1):
|
|
41
41
|
self.dataset = self.stratified_subsampling(
|
|
42
42
|
self.dataset, seed=self.seed, splits=["train"]
|
|
43
43
|
)
|