mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/abstasks/_statistics_calculation.py
CHANGED
@@ -2,9 +2,8 @@ from __future__ import annotations
 
 import hashlib
 from collections import Counter
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
-from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
     ImageStatistics,
     LabelStatistics,
@@ -15,8 +14,12 @@ from mteb.types.statistics import (
 )
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping
+
     from PIL import Image
 
+    from mteb.types import TopRankedDocumentsType
+
 
 def calculate_text_statistics(texts: list[str]) -> TextStatistics:
     """Calculate descriptive statistics for a list of texts.
@@ -52,7 +55,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
     seen_hashes: set[str] = set()
 
     for img in images:
-        width, height = img.size
+        width, height = img.size
         img_heights.append(height)
         img_widths.append(width)
 
@@ -82,17 +85,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
         LabelStatistics: A dictionary containing the descriptive statistics.
 
     """
+    total_labels: list[int | None] = []
+
     if not isinstance(labels[0], list):
-
-
-
+        # single label classification
+        single_label = cast("list[int]", labels)
+        label_len = [1] * len(single_label)
+        total_label_len = len(single_label)
+        total_labels.extend(single_label)
     elif isinstance(labels[0], list):
         # multilabel classification
-
+        multilabel_labels = cast("list[list[int]]", labels)
+        label_len = [len(l) for l in multilabel_labels]
         total_label_len = sum(label_len)
-
-
-
+        for l in multilabel_labels:
+            if l and len(l) > 0:
+                total_labels.extend(l)
+            else:
+                total_labels.append(None)
     else:
         raise ValueError(
             "Labels must be a list of integers or a list of lists of integers."
@@ -159,7 +169,7 @@ def calculate_top_ranked_statistics(
 
 
 def calculate_relevant_docs_statistics(
-    relevant_docs:
+    relevant_docs: Mapping[str, Mapping[str, int]],
 ) -> RelevantDocsStatistics:
     qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
     unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
mteb/abstasks/_stratification.py
CHANGED
@@ -39,6 +39,7 @@ Bibtex:
 """
 
 import itertools
+from typing import Any
 
 import numpy as np
 import scipy.sparse as sp
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
         if support_size == 0:
             continue
         if currently_chosen is None or (
-            best_number_of_combinations
-            and best_support_size
+            best_number_of_combinations is not None
+            and best_support_size is not None
+            and best_number_of_combinations < number_of_combinations
+            and best_support_size > support_size
         ):
             currently_chosen = combination
             best_number_of_combinations, best_support_size = (
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
         self._rng_state = check_random_state(random_state)
         need_shuffle = shuffle or random_state is not None
         self.order = order
-        super().__init__(
+        super().__init__(
             n_splits,
             shuffle=need_shuffle,
             random_state=self._rng_state if need_shuffle else None,
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
             self.percentage_per_fold = sample_distribution_per_fold
         else:
             self.percentage_per_fold = [
-                1 / float(self.n_splits)
-                for _ in range(self.n_splits)  # type: ignore
+                1 / float(self.n_splits) for _ in range(self.n_splits)
             ]
 
     def _prepare_stratification(
@@ -182,9 +184,9 @@ class IterativeStratification(_BaseKFold):
         list[list[int]],
         dict[int, bool],
         list[list[int]],
-        list[list[
-        dict[
-        list[list[
+        list[list[Any]],
+        dict[str, list[Any]],
+        list[list[Any]],
     ]:
         """Prepares variables for performing stratification
 
@@ -206,14 +208,14 @@ class IterativeStratification(_BaseKFold):
         """
         self.n_samples, self.n_labels = y.shape
         self.desired_samples_per_fold = np.array(
-            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
+            [self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
         )
         rows = sp.lil_matrix(y).rows
         rows_used = dict.fromkeys(range(self.n_samples), False)
         all_combinations = []
-        per_row_combinations = [[] for i in range(self.n_samples)]
-        samples_with_combination = {}
-        folds = [[] for _ in range(self.n_splits)]
+        per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
+        samples_with_combination: dict[str, list[Any]] = {}
+        folds: list[list[int]] = [[] for _ in range(self.n_splits)]
 
         # for every row
         for sample_index, label_assignment in enumerate(rows):
@@ -229,21 +231,19 @@ class IterativeStratification(_BaseKFold):
                 all_combinations.append(combination)
                 per_row_combinations[sample_index].append(combination)
 
-        all_combinations = [list(x) for x in set(all_combinations)]
-
         self.desired_samples_per_combination_per_fold = {
             combination: np.array(
                 [
                     len(evidence_for_combination) * self.percentage_per_fold[j]
-                    for j in range(self.n_splits)
+                    for j in range(self.n_splits)
                 ]
             )
             for combination, evidence_for_combination in samples_with_combination.items()
         }
         return (
-            rows,
+            rows.tolist(),
             rows_used,
-            all_combinations,
+            [list(x) for x in set(all_combinations)],
             per_row_combinations,
             samples_with_combination,
             folds,
@@ -328,7 +328,7 @@ class IterativeStratification(_BaseKFold):
             per_row_combinations,
             samples_with_combination,
             folds,
-        ) = self._prepare_stratification(y)
+        ) = self._prepare_stratification(y)
 
         self._distribute_positive_evidence(
             rows_used, folds, samples_with_combination, per_row_combinations
mteb/abstasks/abstask.py
CHANGED
@@ -1,28 +1,38 @@
+from __future__ import annotations
+
 import json
 import logging
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from copy import copy
 from pathlib import Path
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import numpy as np
 from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
 from sklearn.preprocessing import MultiLabelBinarizer
 from tqdm.auto import tqdm
-from typing_extensions import Self
 
 from mteb._set_seed import _set_seed
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.languages import LanguageScripts
 from mteb.models import (
     CrossEncoderProtocol,
     EncoderProtocol,
-    MTEBModels,
     SearchProtocol,
 )
-
-
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from typing_extensions import Self
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.models import (
+        MTEBModels,
+    )
+    from mteb.types import EncodeKwargs, HFSubset, Modalities, ScoresDict
+    from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
 
 logger = logging.getLogger(__name__)
 
@@ -78,8 +88,8 @@ class AbsTask(ABC):
     """
 
     metadata: TaskMetadata
-    abstask_prompt: str
-    _eval_splits:
+    abstask_prompt: str
+    _eval_splits: Sequence[str] | None = None
     dataset: dict[HFSubset, DatasetDict] | None = None
     data_loaded: bool = False
     hf_subsets: list[HFSubset]
@@ -102,15 +112,18 @@ class AbsTask(ABC):
     def check_if_dataset_is_superseded(self) -> None:
         """Check if the dataset is superseded by a newer version."""
         if self.superseded_by:
-
-
-            )
+            msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
+            logger.warning(msg)
+            warnings.warn(msg)
 
-    def dataset_transform(self):
+    def dataset_transform(self, num_proc: int = 1):
         """A transform operations applied to the dataset after loading.
 
         This method is useful when the dataset from Huggingface is not in an `mteb` compatible format.
         Override this method if your dataset requires additional transformation.
+
+        Args:
+            num_proc: Number of processes to use for the transformation.
         """
         pass
 
@@ -120,10 +133,11 @@ class AbsTask(ABC):
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
-    ) ->
+    ) -> Mapping[HFSubset, ScoresDict]:
         """Evaluates an MTEB compatible model on the task.
 
         Args:
@@ -132,6 +146,7 @@ class AbsTask(ABC):
             subsets_to_run: List of huggingface subsets (HFSubsets) to evaluate. If None, all subsets are evaluated.
             encode_kwargs: Additional keyword arguments that are passed to the model's `encode` method.
             prediction_folder: Folder to save model predictions
+            num_proc: Number of processes to use for loading the dataset or processing.
             kwargs: Additional keyword arguments that are passed to the _evaluate_subset method.
 
         Returns:
@@ -161,7 +176,7 @@ class AbsTask(ABC):
         if not self.data_loaded:
             self.load_data()
 
-        self.dataset = cast(dict[HFSubset, DatasetDict], self.dataset)
+        self.dataset = cast("dict[HFSubset, DatasetDict]", self.dataset)
 
         scores = {}
         if self.hf_subsets is None:
@@ -187,6 +202,7 @@ class AbsTask(ABC):
                 hf_subset=hf_subset,
                 encode_kwargs=encode_kwargs,
                 prediction_folder=prediction_folder,
+                num_proc=num_proc,
                 **kwargs,
             )
             self._add_main_score(scores[hf_subset])
@@ -195,13 +211,14 @@ class AbsTask(ABC):
     @abstractmethod
     def _evaluate_subset(
         self,
-        model:
+        model: MTEBModels,
         data_split: Dataset,
         *,
-        encode_kwargs: dict[str, Any],
         hf_split: str,
         hf_subset: str,
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs: Any,
     ) -> ScoresDict:
         raise NotImplementedError(
@@ -210,7 +227,7 @@ class AbsTask(ABC):
 
     def _save_task_predictions(
         self,
-        predictions:
+        predictions: Mapping[str, Any] | list[Any],
         model: MTEBModels,
         prediction_folder: Path,
         hf_split: str,
@@ -226,7 +243,7 @@ class AbsTask(ABC):
             hf_subset: The subset of the dataset (e.g. "en").
         """
         predictions_path = self._predictions_path(prediction_folder)
-        existing_results = {
+        existing_results: dict[str, Any] = {
             "mteb_model_meta": {
                 "model_name": model.mteb_model_meta.name,
                 "revision": model.mteb_model_meta.revision,
@@ -306,11 +323,15 @@ class AbsTask(ABC):
         )  # only take the specified test split.
         return dataset_dict
 
-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs: Any) -> None:
         """Loads dataset from HuggingFace hub
 
         This is the main loading function for Task. Do not overwrite this, instead we recommend using `dataset_transform`, which is called after the
         dataset is loaded using `datasets.load_dataset`.
+
+        Args:
+            num_proc: Number of processes to use for loading the dataset.
+            kwargs: Additional keyword arguments passed to the load_dataset function. Keep for forward compatibility.
         """
         if self.data_loaded:
             return
@@ -323,11 +344,12 @@ class AbsTask(ABC):
                 self.dataset[hf_subset] = load_dataset(
                     name=hf_subset,
                     **self.metadata.dataset,
+                    num_proc=num_proc,
                 )
         else:
             # some of monolingual datasets explicitly adding the split name to the dataset name
-            self.dataset = load_dataset(**self.metadata.dataset)
-        self.dataset_transform()
+            self.dataset = load_dataset(**self.metadata.dataset, num_proc=num_proc)
+        self.dataset_transform(num_proc=num_proc)
         self.data_loaded = True
 
     def fast_load(self) -> None:
@@ -350,27 +372,32 @@ class AbsTask(ABC):
             self.dataset[lang] = DatasetDict(subset)
 
     def calculate_descriptive_statistics(
-        self, overwrite_results: bool = False
+        self, overwrite_results: bool = False, num_proc: int = 1
    ) -> dict[str, DescriptiveStatistics]:
         """Calculates descriptive statistics from the dataset.
 
         Args:
             overwrite_results: Whether to overwrite existing results. If False and results already exist, the existing results will be loaded from cache.
+            num_proc: Number of processes to use for loading the dataset.
 
         Returns:
             A dictionary containing descriptive statistics for each split.
         """
         from mteb.abstasks import AbsTaskClassification
 
-
+        existing_stats = self.metadata.descriptive_stats
+
+        if existing_stats is not None and not overwrite_results:
             logger.info("Loading metadata descriptive statistics from cache.")
-            return
+            return existing_stats
 
         if not self.data_loaded:
-            self.load_data()
+            self.load_data(num_proc=num_proc)
 
         descriptive_stats: dict[str, DescriptiveStatistics] = {}
-        hf_subset_stat
+        hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
+            "hf_subset_descriptive_stats"
+        )
         eval_splits = self.metadata.eval_splits
         if isinstance(self, AbsTaskClassification):
             eval_splits.append(self.train_split)
@@ -381,7 +408,7 @@ class AbsTask(ABC):
             logger.info(f"Processing metadata for split {split}")
             if self.metadata.is_multilingual:
                 descriptive_stats[split] = (
-                    self._calculate_descriptive_statistics_from_split(
+                    self._calculate_descriptive_statistics_from_split(  # type: ignore[assignment]
                         split, compute_overall=True
                     )
                 )
@@ -400,7 +427,7 @@ class AbsTask(ABC):
                     descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
             else:
                 split_details = self._calculate_descriptive_statistics_from_split(split)
-                descriptive_stats[split] = split_details
+                descriptive_stats[split] = split_details  # type: ignore[assignment]
 
         with self.metadata.descriptive_stat_path.open("w") as f:
             json.dump(descriptive_stats, f, indent=4)
@@ -437,7 +464,7 @@ class AbsTask(ABC):
 
         return self.metadata.languages
 
-    def filter_eval_splits(self, eval_splits:
+    def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
         """Filter the evaluation splits of the task.
 
         Args:
@@ -451,9 +478,9 @@ class AbsTask(ABC):
 
     def filter_languages(
         self,
-        languages:
-        script:
-        hf_subsets:
+        languages: Sequence[str] | None,
+        script: Sequence[str] | None = None,
+        hf_subsets: Sequence[HFSubset] | None = None,
         exclusive_language_filter: bool = False,
     ) -> Self:
         """Filter the languages of the task.
@@ -499,12 +526,14 @@ class AbsTask(ABC):
         self.hf_subsets = subsets_to_keep
         return self
 
-    def _add_main_score(self, scores:
+    def _add_main_score(self, scores: ScoresDict) -> None:
         scores["main_score"] = scores[self.metadata.main_score]
 
     def _upload_dataset_to_hub(
-        self, repo_name: str, fields: list[str] | dict[str, str]
+        self, repo_name: str, fields: list[str] | dict[str, str], num_proc: int = 1
     ) -> None:
+        if self.dataset is None:
+            raise ValueError("Dataset not loaded")
         if self.metadata.is_multilingual:
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
@@ -526,7 +555,10 @@ class AbsTask(ABC):
                 )
                 sentences = DatasetDict(sentences)
                 sentences.push_to_hub(
-                    repo_name,
+                    repo_name,
+                    config,
+                    commit_message=f"Add {config} dataset",
+                    num_proc=num_proc,
                 )
             else:
                 sentences = {}
@@ -543,16 +575,19 @@ class AbsTask(ABC):
                     {field: self.dataset[split][field] for field in fields}
                 )
             sentences = DatasetDict(sentences)
-            sentences.push_to_hub(
+            sentences.push_to_hub(
+                repo_name, commit_message="Add dataset", num_proc=num_proc
+            )
 
-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         raise NotImplementedError
 
-    def push_dataset_to_hub(self, repo_name: str) -> None:
+    def push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         """Push the dataset to the HuggingFace Hub.
 
         Args:
             repo_name: The name of the repository to push the dataset to.
+            num_proc: Number of processes to use for loading the dataset.
 
         Examples:
             >>> import mteb
@@ -564,7 +599,7 @@ class AbsTask(ABC):
         if not self.data_loaded:
             self.load_data()
 
-        self._push_dataset_to_hub(repo_name)
+        self._push_dataset_to_hub(repo_name, num_proc)
         # dataset repo not creating when pushing card
         self.metadata.push_dataset_card_to_hub(repo_name)
 
@@ -574,7 +609,7 @@ class AbsTask(ABC):
             return False
 
     @property
-    def eval_splits(self) ->
+    def eval_splits(self) -> Sequence[str]:
         """Returns the evaluation splits of the task."""
         if self._eval_splits:
             return self._eval_splits
@@ -607,9 +642,8 @@ class AbsTask(ABC):
             self.data_loaded = False
             logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
         else:
-
-
-            )
+            msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
+            logger.warning(msg)
 
     @property
     def superseded_by(self) -> str | None:
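
A minimal sketch of how the new `num_proc` argument added throughout `AbsTask` can be used; the signatures follow the diff above, while the task name below is only an example:

    import mteb

    task = mteb.get_task("NFCorpus")  # example task name, any task works
    task.load_data(num_proc=4)        # forwarded to datasets.load_dataset and dataset_transform
    stats = task.calculate_descriptive_statistics(num_proc=4)
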
mteb/abstasks/aggregate_task_metadata.py
CHANGED
@@ -1,29 +1,39 @@
+from __future__ import annotations
+
 import logging
 from datetime import datetime
+from typing import TYPE_CHECKING
 
 from pydantic import ConfigDict, Field, model_validator
-from typing_extensions import Self
 
 from mteb.types import (
-    HFSubset,
-    ISOLanguageScript,
     Languages,
-    Licenses,
-    Modalities,
-    StrDate,
 )
 
 from .abstask import AbsTask
 from .task_metadata import (
-    AnnotatorType,
     MetadataDatasetDict,
-    SampleCreationMethod,
-    TaskDomain,
     TaskMetadata,
-    TaskSubtype,
     TaskType,
 )
 
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from mteb.types import (
+        ISOLanguageScript,
+        Licenses,
+        Modalities,
+        StrDate,
+    )
+
+    from .task_metadata import (
+        AnnotatorType,
+        SampleCreationMethod,
+        TaskDomain,
+        TaskSubtype,
+    )
+
 logger = logging.getLogger(__name__)
 
 
@@ -60,14 +70,7 @@ class AggregateTaskMetadata(TaskMetadata):
     reference: str | None = None
     bibtex_citation: str | None = None
 
-    @
-    def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
-        """Return a dictionary mapping huggingface subsets to languages."""
-        if isinstance(self.eval_langs, dict):
-            return self.eval_langs
-        return {"default": self.eval_langs}  # type: ignore
-
-    @model_validator(mode="after")  # type: ignore
+    @model_validator(mode="after")
     def _compute_unfilled_cases(self) -> Self:
         if not self.eval_langs:
             self.eval_langs = self._compute_eval_langs()
mteb/abstasks/aggregated_task.py
CHANGED
@@ -1,18 +1,26 @@
+from __future__ import annotations
+
 import logging
-
-from typing import Any
+import warnings
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
-from datasets import Dataset, DatasetDict
-from typing_extensions import Self
 
-from mteb.models.models_protocols import MTEBModels
 from mteb.results.task_result import TaskResult
-from mteb.types import HFSubset, ScoresDict
-from mteb.types.statistics import DescriptiveStatistics
 
 from .abstask import AbsTask
-
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+    from pathlib import Path
+
+    from datasets import Dataset, DatasetDict
+
+    from mteb.models.models_protocols import MTEBModels
+    from mteb.types import EncodeKwargs, HFSubset, ScoresDict
+    from mteb.types.statistics import DescriptiveStatistics
+
+    from .aggregate_task_metadata import AggregateTaskMetadata
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +40,7 @@ class AbsTaskAggregate(AbsTask):
 
     def task_results_to_scores(
         self, task_results: list[TaskResult]
-    ) -> dict[str,
+    ) -> dict[str, Mapping[HFSubset, ScoresDict]]:
         """The function that aggregated scores. Can be redefined to allow for custom aggregations.
 
         Args:
@@ -41,7 +49,7 @@ class AbsTaskAggregate(AbsTask):
         Returns:
             A dictionary with the aggregated scores.
         """
-        scores = {}
+        scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
         subsets = (
             self.metadata.eval_langs.keys()
             if isinstance(self.metadata.eval_langs, dict)
@@ -113,40 +121,20 @@ class AbsTaskAggregate(AbsTask):
         )
         mteb_versions = {tr.mteb_version for tr in task_results}
         if len(mteb_versions) != 1:
-
-
-            )
+            msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
+            logger.warning(msg)
+            warnings.warn(msg)
             task_res.mteb_version = None
         task_res.mteb_version = task_results[0].mteb_version
         return task_res
 
-    def check_if_dataset_is_superseded(self) -> None:
-        """Check if the dataset is superseded by a newer version"""
-        if self.superseded_by:
-            logger.warning(
-                f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
-            )
-
-    def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
-        """Filter the evaluation splits of the task.
-
-        Args:
-            eval_splits: List of splits to evaluate on. If None, all splits in metadata
-                are used.
-
-        Returns:
-            The task with filtered evaluation splits.
-        """
-        self._eval_splits = eval_splits
-        return self
-
     def evaluate(
         self,
         model: MTEBModels,
         split: str = "test",
         subsets_to_run: list[HFSubset] | None = None,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
         **kwargs: Any,
     ) -> dict[HFSubset, ScoresDict]:
@@ -160,7 +148,7 @@ class AbsTaskAggregate(AbsTask):
         self,
         model: MTEBModels,
         data_split: DatasetDict | Dataset,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         **kwargs: Any,
     ) -> ScoresDict:
         raise NotImplementedError(