mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/filter_tasks.py
CHANGED
|
@@ -1,19 +1,24 @@
|
|
|
1
1
|
"""This script contains functions that are used to get an overview of the MTEB benchmark."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
|
-
from
|
|
5
|
-
from typing import overload
|
|
6
|
+
from typing import TYPE_CHECKING, overload
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks import (
|
|
8
|
-
AbsTask,
|
|
9
|
-
)
|
|
10
8
|
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
12
9
|
from mteb.languages import (
|
|
13
10
|
ISO_TO_LANGUAGE,
|
|
14
11
|
ISO_TO_SCRIPT,
|
|
15
12
|
)
|
|
16
|
-
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Iterable, Sequence
|
|
16
|
+
|
|
17
|
+
from mteb.abstasks import (
|
|
18
|
+
AbsTask,
|
|
19
|
+
)
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
21
|
+
from mteb.types import Modalities
|
|
17
22
|
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
|
@@ -34,14 +39,14 @@ def _check_is_valid_language(lang: str) -> None:
|
|
|
34
39
|
|
|
35
40
|
@overload
|
|
36
41
|
def filter_tasks(
|
|
37
|
-
tasks:
|
|
42
|
+
tasks: Iterable[AbsTask],
|
|
38
43
|
*,
|
|
39
|
-
languages:
|
|
40
|
-
script:
|
|
41
|
-
domains:
|
|
42
|
-
task_types:
|
|
43
|
-
categories:
|
|
44
|
-
modalities:
|
|
44
|
+
languages: Sequence[str] | None = None,
|
|
45
|
+
script: Sequence[str] | None = None,
|
|
46
|
+
domains: Iterable[TaskDomain] | None = None,
|
|
47
|
+
task_types: Iterable[TaskType] | None = None,
|
|
48
|
+
categories: Iterable[TaskCategory] | None = None,
|
|
49
|
+
modalities: Iterable[Modalities] | None = None,
|
|
45
50
|
exclusive_modality_filter: bool = False,
|
|
46
51
|
exclude_superseded: bool = False,
|
|
47
52
|
exclude_aggregate: bool = False,
|
|
@@ -51,14 +56,14 @@ def filter_tasks(
|
|
|
51
56
|
|
|
52
57
|
@overload
|
|
53
58
|
def filter_tasks(
|
|
54
|
-
tasks:
|
|
59
|
+
tasks: Iterable[type[AbsTask]],
|
|
55
60
|
*,
|
|
56
|
-
languages:
|
|
57
|
-
script:
|
|
58
|
-
domains:
|
|
59
|
-
task_types:
|
|
60
|
-
categories:
|
|
61
|
-
modalities:
|
|
61
|
+
languages: Sequence[str] | None = None,
|
|
62
|
+
script: Sequence[str] | None = None,
|
|
63
|
+
domains: Iterable[TaskDomain] | None = None,
|
|
64
|
+
task_types: Iterable[TaskType] | None = None,
|
|
65
|
+
categories: Iterable[TaskCategory] | None = None,
|
|
66
|
+
modalities: Iterable[Modalities] | None = None,
|
|
62
67
|
exclusive_modality_filter: bool = False,
|
|
63
68
|
exclude_superseded: bool = False,
|
|
64
69
|
exclude_aggregate: bool = False,
|
|
@@ -67,14 +72,14 @@ def filter_tasks(
|
|
|
67
72
|
|
|
68
73
|
|
|
69
74
|
def filter_tasks(
|
|
70
|
-
tasks:
|
|
75
|
+
tasks: Iterable[AbsTask] | Iterable[type[AbsTask]],
|
|
71
76
|
*,
|
|
72
|
-
languages:
|
|
73
|
-
script:
|
|
74
|
-
domains:
|
|
75
|
-
task_types:
|
|
76
|
-
categories:
|
|
77
|
-
modalities:
|
|
77
|
+
languages: Sequence[str] | None = None,
|
|
78
|
+
script: Sequence[str] | None = None,
|
|
79
|
+
domains: Iterable[TaskDomain] | None = None,
|
|
80
|
+
task_types: Iterable[TaskType] | None = None,
|
|
81
|
+
categories: Iterable[TaskCategory] | None = None,
|
|
82
|
+
modalities: Iterable[Modalities] | None = None,
|
|
78
83
|
exclusive_modality_filter: bool = False,
|
|
79
84
|
exclude_superseded: bool = False,
|
|
80
85
|
exclude_aggregate: bool = False,
|
|
@@ -92,7 +97,6 @@ def filter_tasks(
|
|
|
92
97
|
task_types: A string specifying the type of task e.g. "Classification" or "Retrieval". If None, all tasks are included.
|
|
93
98
|
categories: A list of task categories these include "t2t" (text to text), "t2i" (text to image). See TaskMetadata for the full list.
|
|
94
99
|
exclude_superseded: A boolean flag to exclude datasets which are superseded by another.
|
|
95
|
-
eval_splits: A list of evaluation splits to include. If None, all splits are included.
|
|
96
100
|
modalities: A list of modalities to include. If None, all modalities are included.
|
|
97
101
|
exclusive_modality_filter: If True, only keep tasks where _all_ filter modalities are included in the
|
|
98
102
|
task's modalities and ALL task modalities are in filter modalities (exact match).
|
|
@@ -113,12 +117,12 @@ def filter_tasks(
|
|
|
113
117
|
"""
|
|
114
118
|
langs_to_keep = None
|
|
115
119
|
if languages:
|
|
116
|
-
[_check_is_valid_language(lang) for lang in languages]
|
|
120
|
+
[_check_is_valid_language(lang) for lang in languages] # type: ignore[func-returns-value]
|
|
117
121
|
langs_to_keep = set(languages)
|
|
118
122
|
|
|
119
123
|
script_to_keep = None
|
|
120
124
|
if script:
|
|
121
|
-
[_check_is_valid_script(s) for s in script]
|
|
125
|
+
[_check_is_valid_script(s) for s in script] # type: ignore[func-returns-value]
|
|
122
126
|
script_to_keep = set(script)
|
|
123
127
|
|
|
124
128
|
domains_to_keep = None
|
|
@@ -178,4 +182,4 @@ def filter_tasks(
|
|
|
178
182
|
|
|
179
183
|
_tasks.append(t)
|
|
180
184
|
|
|
181
|
-
return _tasks
|
|
185
|
+
return _tasks # type: ignore[return-value] # type checker cannot infer the overload return type
|
mteb/get_tasks.py
CHANGED
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
"""This script contains functions that are used to get an overview of the MTEB benchmark."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import difflib
|
|
4
6
|
import logging
|
|
7
|
+
import warnings
|
|
5
8
|
from collections import Counter, defaultdict
|
|
6
|
-
from
|
|
7
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
8
10
|
|
|
9
11
|
import pandas as pd
|
|
10
12
|
|
|
11
13
|
from mteb.abstasks import (
|
|
12
14
|
AbsTask,
|
|
13
15
|
)
|
|
14
|
-
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
15
16
|
from mteb.filter_tasks import filter_tasks
|
|
16
|
-
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Iterable, Sequence
|
|
20
|
+
|
|
21
|
+
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
22
|
+
from mteb.types import Modalities
|
|
17
23
|
|
|
18
24
|
logger = logging.getLogger(__name__)
|
|
19
25
|
|
|
@@ -22,12 +28,11 @@ logger = logging.getLogger(__name__)
|
|
|
22
28
|
def _gather_tasks() -> tuple[type[AbsTask], ...]:
|
|
23
29
|
import mteb.tasks as tasks
|
|
24
30
|
|
|
25
|
-
|
|
31
|
+
return tuple(
|
|
26
32
|
t
|
|
27
33
|
for t in tasks.__dict__.values()
|
|
28
34
|
if isinstance(t, type) and issubclass(t, AbsTask)
|
|
29
|
-
|
|
30
|
-
return tuple(tasks)
|
|
35
|
+
)
|
|
31
36
|
|
|
32
37
|
|
|
33
38
|
def _create_name_to_task_mapping(
|
|
@@ -43,7 +48,7 @@ def _create_name_to_task_mapping(
|
|
|
43
48
|
return metadata_names
|
|
44
49
|
|
|
45
50
|
|
|
46
|
-
def _create_similar_tasks(tasks:
|
|
51
|
+
def _create_similar_tasks(tasks: Iterable[type[AbsTask]]) -> dict[str, list[str]]:
|
|
47
52
|
"""Create a dictionary of similar tasks.
|
|
48
53
|
|
|
49
54
|
Returns:
|
|
@@ -194,9 +199,8 @@ class MTEBTasks(tuple[AbsTask]):
|
|
|
194
199
|
string with a LaTeX table.
|
|
195
200
|
"""
|
|
196
201
|
if include_citation_in_name and "name" in properties:
|
|
197
|
-
properties
|
|
198
|
-
df =
|
|
199
|
-
df["name"] = df["name"] + " " + df["intext_citation"]
|
|
202
|
+
df = self.to_dataframe(tuple(properties) + ("intext_citation",))
|
|
203
|
+
df["name"] = df["name"] + " " + df["intext_citation"] # type: ignore[operator]
|
|
200
204
|
df = df.drop(columns=["intext_citation"])
|
|
201
205
|
else:
|
|
202
206
|
df = self.to_dataframe(properties)
|
|
@@ -221,17 +225,17 @@ class MTEBTasks(tuple[AbsTask]):
|
|
|
221
225
|
|
|
222
226
|
|
|
223
227
|
def get_tasks(
|
|
224
|
-
tasks:
|
|
228
|
+
tasks: Sequence[str] | None = None,
|
|
225
229
|
*,
|
|
226
|
-
languages:
|
|
227
|
-
script:
|
|
228
|
-
domains:
|
|
229
|
-
task_types:
|
|
230
|
-
categories:
|
|
230
|
+
languages: Sequence[str] | None = None,
|
|
231
|
+
script: Sequence[str] | None = None,
|
|
232
|
+
domains: Sequence[TaskDomain] | None = None,
|
|
233
|
+
task_types: Sequence[TaskType] | None = None,
|
|
234
|
+
categories: Sequence[TaskCategory] | None = None,
|
|
231
235
|
exclude_superseded: bool = True,
|
|
232
|
-
eval_splits:
|
|
236
|
+
eval_splits: Sequence[str] | None = None,
|
|
233
237
|
exclusive_language_filter: bool = False,
|
|
234
|
-
modalities:
|
|
238
|
+
modalities: Sequence[Modalities] | None = None,
|
|
235
239
|
exclusive_modality_filter: bool = False,
|
|
236
240
|
exclude_aggregate: bool = False,
|
|
237
241
|
exclude_private: bool = True,
|
|
@@ -287,7 +291,7 @@ def get_tasks(
|
|
|
287
291
|
]
|
|
288
292
|
return MTEBTasks(_tasks)
|
|
289
293
|
|
|
290
|
-
|
|
294
|
+
tasks_: Sequence[type[AbsTask]] = filter_tasks(
|
|
291
295
|
TASK_LIST,
|
|
292
296
|
languages=languages,
|
|
293
297
|
script=script,
|
|
@@ -300,12 +304,12 @@ def get_tasks(
|
|
|
300
304
|
exclude_aggregate=exclude_aggregate,
|
|
301
305
|
exclude_private=exclude_private,
|
|
302
306
|
)
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
307
|
+
return MTEBTasks(
|
|
308
|
+
[
|
|
309
|
+
cls().filter_languages(languages, script).filter_eval_splits(eval_splits)
|
|
310
|
+
for cls in tasks_
|
|
311
|
+
]
|
|
312
|
+
)
|
|
309
313
|
|
|
310
314
|
|
|
311
315
|
_TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
|
|
@@ -313,10 +317,10 @@ _TASK_RENAMES = {"PersianTextTone": "SynPerTextToneClassification"}
|
|
|
313
317
|
|
|
314
318
|
def get_task(
|
|
315
319
|
task_name: str,
|
|
316
|
-
languages:
|
|
317
|
-
script:
|
|
318
|
-
eval_splits:
|
|
319
|
-
hf_subsets:
|
|
320
|
+
languages: Sequence[str] | None = None,
|
|
321
|
+
script: Sequence[str] | None = None,
|
|
322
|
+
eval_splits: Sequence[str] | None = None,
|
|
323
|
+
hf_subsets: Sequence[str] | None = None,
|
|
320
324
|
exclusive_language_filter: bool = False,
|
|
321
325
|
) -> AbsTask:
|
|
322
326
|
"""Get a task by name.
|
|
@@ -340,9 +344,9 @@ def get_task(
|
|
|
340
344
|
"""
|
|
341
345
|
if task_name in _TASK_RENAMES:
|
|
342
346
|
_task_name = _TASK_RENAMES[task_name]
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
)
|
|
347
|
+
msg = f"The task with the given name '{task_name}' has been renamed to '{_task_name}'. To prevent this warning use the new name."
|
|
348
|
+
logger.warning(msg)
|
|
349
|
+
warnings.warn(msg)
|
|
346
350
|
|
|
347
351
|
if task_name not in _TASKS_REGISTRY:
|
|
348
352
|
close_matches = difflib.get_close_matches(task_name, _TASKS_REGISTRY.keys())
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
2
3
|
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from mteb.languages.check_language_code import check_language_code
|
|
3
7
|
|
|
4
|
-
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Iterable, Sequence
|
|
5
10
|
|
|
6
|
-
from
|
|
11
|
+
from typing_extensions import Self
|
|
7
12
|
|
|
8
13
|
|
|
9
14
|
@dataclass
|
|
@@ -25,7 +30,9 @@ class LanguageScripts:
|
|
|
25
30
|
|
|
26
31
|
@classmethod
|
|
27
32
|
def from_languages_and_scripts(
|
|
28
|
-
cls,
|
|
33
|
+
cls,
|
|
34
|
+
languages: Sequence[str] | None = None,
|
|
35
|
+
scripts: Sequence[str] | None = None,
|
|
29
36
|
) -> Self:
|
|
30
37
|
"""Create a LanguageScripts object from lists of languages and scripts.
|
|
31
38
|
|
mteb/leaderboard/app.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import itertools
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
@@ -5,15 +7,14 @@ import tempfile
|
|
|
5
7
|
import time
|
|
6
8
|
import warnings
|
|
7
9
|
from pathlib import Path
|
|
8
|
-
from typing import Literal
|
|
10
|
+
from typing import TYPE_CHECKING, Literal, get_args
|
|
9
11
|
from urllib.parse import urlencode
|
|
10
12
|
|
|
11
13
|
import cachetools
|
|
12
14
|
import gradio as gr
|
|
13
|
-
import pandas as pd
|
|
15
|
+
import pandas as pd # noqa: TC002 # gradio tries to validate typehints
|
|
14
16
|
|
|
15
17
|
import mteb
|
|
16
|
-
from mteb import BenchmarkResults
|
|
17
18
|
from mteb.benchmarks.benchmark import RtebBenchmark
|
|
18
19
|
from mteb.cache import ResultCache
|
|
19
20
|
from mteb.leaderboard.benchmark_selector import (
|
|
@@ -29,40 +30,118 @@ from mteb.leaderboard.table import (
|
|
|
29
30
|
apply_summary_styling_from_benchmark,
|
|
30
31
|
)
|
|
31
32
|
from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
|
|
33
|
+
from mteb.models.model_meta import MODEL_TYPES
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from mteb import BenchmarkResults
|
|
32
37
|
|
|
33
38
|
logger = logging.getLogger(__name__)
|
|
34
39
|
|
|
40
|
+
|
|
35
41
|
LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
|
|
42
|
+
MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
|
|
36
43
|
|
|
37
44
|
|
|
38
45
|
def _load_results(cache: ResultCache) -> BenchmarkResults:
|
|
46
|
+
"""Load benchmark results using an optimized caching strategy.
|
|
47
|
+
|
|
48
|
+
This function implements a two-tier caching strategy for faster leaderboard startup:
|
|
49
|
+
|
|
50
|
+
1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
|
|
51
|
+
'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
|
|
52
|
+
This avoids the need to clone the entire results repository and provides
|
|
53
|
+
near-instantaneous loading for most users.
|
|
54
|
+
|
|
55
|
+
2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
|
|
56
|
+
the original approach of downloading the full results repository and
|
|
57
|
+
building the cache from scratch.
|
|
58
|
+
|
|
59
|
+
The cached results file contains pre-aggregated benchmark data that eliminates
|
|
60
|
+
the need for expensive operations like task selection and revision joining
|
|
61
|
+
during app startup.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
cache: ResultCache instance used for both optimized and fallback operations
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
BenchmarkResults: Complete benchmark results ready for leaderboard display
|
|
68
|
+
|
|
69
|
+
Raises:
|
|
70
|
+
Various exceptions related to network issues, file I/O, or data validation
|
|
71
|
+
are logged and may cause fallback to the slower repository-based approach.
|
|
72
|
+
"""
|
|
39
73
|
start_time = time.time()
|
|
40
74
|
results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
|
|
75
|
+
|
|
41
76
|
if not results_cache_path.exists():
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
load_start = time.time()
|
|
48
|
-
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
49
|
-
|
|
50
|
-
all_results = cache.load_results(
|
|
51
|
-
models=all_model_names,
|
|
52
|
-
only_main_score=True,
|
|
53
|
-
require_model_meta=False,
|
|
54
|
-
include_remote=True,
|
|
77
|
+
# First try to download the cached results file from the cached-data branch
|
|
78
|
+
# This is faster than cloning the entire results repository
|
|
79
|
+
logger.info(
|
|
80
|
+
"Cached results not found, trying to download from cached-data branch..."
|
|
55
81
|
)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
82
|
+
|
|
83
|
+
try:
|
|
84
|
+
# Use ResultCache's optimized download method
|
|
85
|
+
# Default saves to mteb/leaderboard/__cached_results.json
|
|
86
|
+
results_cache_path = cache._download_cached_results_from_branch()
|
|
87
|
+
download_time = time.time() - start_time
|
|
88
|
+
logger.info(
|
|
89
|
+
f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
except Exception as e:
|
|
93
|
+
logger.error(
|
|
94
|
+
f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
|
|
95
|
+
)
|
|
96
|
+
logger.info("Falling back to downloading full remote repository...")
|
|
97
|
+
|
|
98
|
+
# Fall back to the original approach: clone the full repo
|
|
99
|
+
cache.download_from_remote()
|
|
100
|
+
download_time = time.time() - start_time
|
|
101
|
+
logger.info(f"Downloaded remote results in {download_time:.2f}s")
|
|
102
|
+
|
|
103
|
+
load_start = time.time()
|
|
104
|
+
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
105
|
+
|
|
106
|
+
all_results = cache.load_results(
|
|
107
|
+
models=all_model_names,
|
|
108
|
+
only_main_score=True,
|
|
109
|
+
require_model_meta=False,
|
|
110
|
+
include_remote=True,
|
|
111
|
+
)
|
|
112
|
+
load_time = time.time() - load_start
|
|
113
|
+
logger.info(f"Loaded results from cache in {load_time:.2f}s")
|
|
114
|
+
return all_results
|
|
115
|
+
|
|
116
|
+
# Load the cached results file (either pre-existing or just downloaded)
|
|
117
|
+
logger.info("Loading cached results from disk...")
|
|
118
|
+
try:
|
|
119
|
+
logger.info(f"Opening file: {results_cache_path}")
|
|
120
|
+
|
|
121
|
+
file_size = results_cache_path.stat().st_size
|
|
122
|
+
logger.info(f"File exists, size: {file_size} bytes")
|
|
123
|
+
|
|
61
124
|
with results_cache_path.open() as cache_file:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
125
|
+
logger.info("File opened successfully, attempting JSON parse...")
|
|
126
|
+
json_data = json.load(cache_file)
|
|
127
|
+
logger.info(
|
|
128
|
+
f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
logger.info("Attempting BenchmarkResults.from_validated...")
|
|
132
|
+
results = mteb.BenchmarkResults.from_validated(**json_data)
|
|
133
|
+
logger.info("BenchmarkResults.from_validated successful")
|
|
134
|
+
|
|
135
|
+
except Exception as e:
|
|
136
|
+
# TODO: Handle the case when we fail to load cached results from disk.
|
|
137
|
+
logger.error(
|
|
138
|
+
f"Failed to load cached results from disk: {type(e).__name__}: {e}"
|
|
139
|
+
)
|
|
140
|
+
raise
|
|
141
|
+
|
|
142
|
+
total_time = time.time() - start_time
|
|
143
|
+
logger.info(f"Loaded cached results in {total_time:.2f}s")
|
|
144
|
+
return results
|
|
66
145
|
|
|
67
146
|
|
|
68
147
|
def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
|
|
@@ -169,7 +248,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
|
|
|
169
248
|
df = df.drop(columns="reference")
|
|
170
249
|
return gr.DataFrame(
|
|
171
250
|
df,
|
|
172
|
-
datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
|
|
251
|
+
datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
|
|
173
252
|
buttons=["copy", "fullscreen"],
|
|
174
253
|
show_search="filter",
|
|
175
254
|
)
|
|
@@ -187,6 +266,7 @@ def _filter_models(
|
|
|
187
266
|
instructions: bool | None,
|
|
188
267
|
max_model_size: int,
|
|
189
268
|
zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
|
|
269
|
+
model_types: list[str] | None,
|
|
190
270
|
):
|
|
191
271
|
lower, upper = 0, max_model_size
|
|
192
272
|
# Setting to None, when the user doesn't specify anything
|
|
@@ -205,6 +285,7 @@ def _filter_models(
|
|
|
205
285
|
use_instructions=instructions,
|
|
206
286
|
frameworks=compatibility,
|
|
207
287
|
n_parameters_range=(lower, upper),
|
|
288
|
+
model_types=model_types,
|
|
208
289
|
)
|
|
209
290
|
|
|
210
291
|
models_to_keep = set()
|
|
@@ -269,6 +350,7 @@ def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
|
|
|
269
350
|
instructions=None,
|
|
270
351
|
max_model_size=MAX_MODEL_SIZE,
|
|
271
352
|
zero_shot_setting="allow_all",
|
|
353
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
272
354
|
)
|
|
273
355
|
# Sort to ensure consistency with update_models
|
|
274
356
|
initial_models = sorted(initial_models)
|
|
@@ -387,6 +469,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
387
469
|
instructions=None,
|
|
388
470
|
max_model_size=MAX_MODEL_SIZE,
|
|
389
471
|
zero_shot_setting="allow_all",
|
|
472
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
390
473
|
)
|
|
391
474
|
default_filtered_scores = [
|
|
392
475
|
entry for entry in default_scores if entry["model_name"] in filtered_models
|
|
@@ -467,7 +550,10 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
467
550
|
|
|
468
551
|
logger.info("Step 7/7: Building Gradio interface and callbacks...")
|
|
469
552
|
interface_start = time.time()
|
|
470
|
-
with gr.Blocks(
|
|
553
|
+
with gr.Blocks(
|
|
554
|
+
title="MTEB Leaderboard",
|
|
555
|
+
fill_width=True,
|
|
556
|
+
) as demo:
|
|
471
557
|
with gr.Sidebar(
|
|
472
558
|
position="left",
|
|
473
559
|
label="Benchmark Selection and Customization",
|
|
@@ -583,6 +669,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
583
669
|
label="Model Parameters",
|
|
584
670
|
interactive=True,
|
|
585
671
|
)
|
|
672
|
+
with gr.Column():
|
|
673
|
+
model_type_select = gr.CheckboxGroup(
|
|
674
|
+
MODEL_TYPE_CHOICES,
|
|
675
|
+
value=MODEL_TYPE_CHOICES,
|
|
676
|
+
label="Model Type",
|
|
677
|
+
)
|
|
586
678
|
|
|
587
679
|
with gr.Tab("Summary"):
|
|
588
680
|
summary_table.render()
|
|
@@ -755,7 +847,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
755
847
|
compatibility,
|
|
756
848
|
instructions,
|
|
757
849
|
max_model_size,
|
|
758
|
-
zero_shot
|
|
850
|
+
zero_shot,
|
|
851
|
+
model_type_select: hash(
|
|
759
852
|
(
|
|
760
853
|
id(scores),
|
|
761
854
|
hash(tuple(tasks)),
|
|
@@ -764,6 +857,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
764
857
|
hash(instructions),
|
|
765
858
|
hash(max_model_size),
|
|
766
859
|
hash(zero_shot),
|
|
860
|
+
hash(tuple(model_type_select)),
|
|
767
861
|
)
|
|
768
862
|
),
|
|
769
863
|
)
|
|
@@ -775,6 +869,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
775
869
|
instructions: bool | None,
|
|
776
870
|
max_model_size: int,
|
|
777
871
|
zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
|
|
872
|
+
model_type_select: list[str],
|
|
778
873
|
):
|
|
779
874
|
start_time = time.time()
|
|
780
875
|
model_names = list({entry["model_name"] for entry in scores})
|
|
@@ -786,6 +881,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
786
881
|
instructions,
|
|
787
882
|
max_model_size,
|
|
788
883
|
zero_shot_setting=zero_shot,
|
|
884
|
+
model_types=model_type_select,
|
|
789
885
|
)
|
|
790
886
|
elapsed = time.time() - start_time
|
|
791
887
|
logger.debug(f"update_models callback: {elapsed}s")
|
|
@@ -803,6 +899,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
803
899
|
instructions,
|
|
804
900
|
max_model_size,
|
|
805
901
|
zero_shot,
|
|
902
|
+
model_type_select,
|
|
806
903
|
],
|
|
807
904
|
outputs=[models],
|
|
808
905
|
)
|
|
@@ -817,6 +914,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
817
914
|
instructions,
|
|
818
915
|
max_model_size,
|
|
819
916
|
zero_shot,
|
|
917
|
+
model_type_select,
|
|
820
918
|
],
|
|
821
919
|
outputs=[models],
|
|
822
920
|
)
|
|
@@ -830,6 +928,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
830
928
|
instructions,
|
|
831
929
|
max_model_size,
|
|
832
930
|
zero_shot,
|
|
931
|
+
model_type_select,
|
|
833
932
|
],
|
|
834
933
|
outputs=[models],
|
|
835
934
|
)
|
|
@@ -843,6 +942,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
843
942
|
instructions,
|
|
844
943
|
max_model_size,
|
|
845
944
|
zero_shot,
|
|
945
|
+
model_type_select,
|
|
846
946
|
],
|
|
847
947
|
outputs=[models],
|
|
848
948
|
)
|
|
@@ -856,6 +956,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
856
956
|
instructions,
|
|
857
957
|
max_model_size,
|
|
858
958
|
zero_shot,
|
|
959
|
+
model_type_select,
|
|
859
960
|
],
|
|
860
961
|
outputs=[models],
|
|
861
962
|
)
|
|
@@ -869,6 +970,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
869
970
|
instructions,
|
|
870
971
|
max_model_size,
|
|
871
972
|
zero_shot,
|
|
973
|
+
model_type_select,
|
|
872
974
|
],
|
|
873
975
|
outputs=[models],
|
|
874
976
|
)
|
|
@@ -882,6 +984,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
882
984
|
instructions,
|
|
883
985
|
max_model_size,
|
|
884
986
|
zero_shot,
|
|
987
|
+
model_type_select,
|
|
988
|
+
],
|
|
989
|
+
outputs=[models],
|
|
990
|
+
)
|
|
991
|
+
model_type_select.change(
|
|
992
|
+
update_models,
|
|
993
|
+
inputs=[
|
|
994
|
+
scores,
|
|
995
|
+
task_select,
|
|
996
|
+
availability,
|
|
997
|
+
compatibility,
|
|
998
|
+
instructions,
|
|
999
|
+
max_model_size,
|
|
1000
|
+
zero_shot,
|
|
1001
|
+
model_type_select,
|
|
885
1002
|
],
|
|
886
1003
|
outputs=[models],
|
|
887
1004
|
)
|
|
@@ -1023,16 +1140,34 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
1023
1140
|
|
|
1024
1141
|
|
|
1025
1142
|
if __name__ == "__main__":
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
logging.
|
|
1030
|
-
logging.
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1143
|
+
import os
|
|
1144
|
+
|
|
1145
|
+
# Add process ID to logging for multiprocessing debugging
|
|
1146
|
+
logging.basicConfig(
|
|
1147
|
+
level=logging.INFO,
|
|
1148
|
+
format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
|
|
1149
|
+
force=True, # Override any existing handlers
|
|
1150
|
+
)
|
|
1151
|
+
|
|
1152
|
+
# Flush log handlers immediately (helpful for multiprocessing)
|
|
1153
|
+
for handler in logging.root.handlers:
|
|
1154
|
+
handler.flush()
|
|
1155
|
+
|
|
1156
|
+
logger.info(f"Starting leaderboard app in process {os.getpid()}")
|
|
1157
|
+
|
|
1158
|
+
# Suppress specific WARNING messages while keeping INFO level for the app
|
|
1159
|
+
logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
|
|
1160
|
+
logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
|
|
1161
|
+
logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
|
|
1162
|
+
|
|
1035
1163
|
warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
|
|
1164
|
+
warnings.filterwarnings("ignore", message="Could not get source model: .*")
|
|
1165
|
+
warnings.filterwarnings(
|
|
1166
|
+
"ignore", message="No scores data available. Returning empty DataFrame."
|
|
1167
|
+
)
|
|
1168
|
+
warnings.filterwarnings("ignore", message="Main score .* not found in scores")
|
|
1169
|
+
warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
|
|
1170
|
+
warnings.filterwarnings("ignore", message=".*: Missing splits .*")
|
|
1036
1171
|
|
|
1037
1172
|
app = get_leaderboard_app()
|
|
1038
1173
|
|