mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/leaderboard/app.py
CHANGED
|
@@ -14,7 +14,6 @@ import pandas as pd
|
|
|
14
14
|
|
|
15
15
|
import mteb
|
|
16
16
|
from mteb import BenchmarkResults
|
|
17
|
-
from mteb.abstasks.task_metadata import TaskDomain, TaskType
|
|
18
17
|
from mteb.benchmarks.benchmark import RtebBenchmark
|
|
19
18
|
from mteb.cache import ResultCache
|
|
20
19
|
from mteb.leaderboard.benchmark_selector import (
|
|
@@ -25,33 +24,120 @@ from mteb.leaderboard.benchmark_selector import (
|
|
|
25
24
|
)
|
|
26
25
|
from mteb.leaderboard.figures import _performance_size_plot, _radar_chart
|
|
27
26
|
from mteb.leaderboard.table import (
|
|
27
|
+
apply_per_language_styling_from_benchmark,
|
|
28
28
|
apply_per_task_styling_from_benchmark,
|
|
29
29
|
apply_summary_styling_from_benchmark,
|
|
30
30
|
)
|
|
31
31
|
from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
|
|
32
|
-
from mteb.
|
|
32
|
+
from mteb.models.model_meta import MODEL_TYPES
|
|
33
33
|
|
|
34
34
|
logger = logging.getLogger(__name__)
|
|
35
35
|
|
|
36
|
+
|
|
36
37
|
LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages})
|
|
38
|
+
MODEL_TYPE_CHOICES = list(get_args(MODEL_TYPES))
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
def _load_results(cache: ResultCache) -> BenchmarkResults:
|
|
42
|
+
"""Load benchmark results using an optimized caching strategy.
|
|
43
|
+
|
|
44
|
+
This function implements a two-tier caching strategy for faster leaderboard startup:
|
|
45
|
+
|
|
46
|
+
1. **Primary Strategy (Fast)**: Download pre-computed cached results from the
|
|
47
|
+
'cached-data' branch as a compressed JSON file (~2MB vs ~200MB full repo).
|
|
48
|
+
This avoids the need to clone the entire results repository and provides
|
|
49
|
+
near-instantaneous loading for most users.
|
|
50
|
+
|
|
51
|
+
2. **Fallback Strategy (Slower)**: If the cached download fails, fall back to
|
|
52
|
+
the original approach of downloading the full results repository and
|
|
53
|
+
building the cache from scratch.
|
|
54
|
+
|
|
55
|
+
The cached results file contains pre-aggregated benchmark data that eliminates
|
|
56
|
+
the need for expensive operations like task selection and revision joining
|
|
57
|
+
during app startup.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
cache: ResultCache instance used for both optimized and fallback operations
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
BenchmarkResults: Complete benchmark results ready for leaderboard display
|
|
64
|
+
|
|
65
|
+
Raises:
|
|
66
|
+
Various exceptions related to network issues, file I/O, or data validation
|
|
67
|
+
are logged and may cause fallback to the slower repository-based approach.
|
|
68
|
+
"""
|
|
69
|
+
start_time = time.time()
|
|
40
70
|
results_cache_path = Path(__file__).parent.joinpath("__cached_results.json")
|
|
71
|
+
|
|
41
72
|
if not results_cache_path.exists():
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
models=all_model_names,
|
|
47
|
-
only_main_score=True,
|
|
48
|
-
require_model_meta=False,
|
|
49
|
-
include_remote=True,
|
|
73
|
+
# First try to download the cached results file from the cached-data branch
|
|
74
|
+
# This is faster than cloning the entire results repository
|
|
75
|
+
logger.info(
|
|
76
|
+
"Cached results not found, trying to download from cached-data branch..."
|
|
50
77
|
)
|
|
51
|
-
|
|
52
|
-
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
# Use ResultCache's optimized download method
|
|
81
|
+
# Default saves to mteb/leaderboard/__cached_results.json
|
|
82
|
+
results_cache_path = cache._download_cached_results_from_branch()
|
|
83
|
+
download_time = time.time() - start_time
|
|
84
|
+
logger.info(
|
|
85
|
+
f"Downloaded cached results from cached-data branch in {download_time:.2f}s"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
except Exception as e:
|
|
89
|
+
logger.error(
|
|
90
|
+
f"Failed to download from cached-data branch: {type(e).__name__}: {e}"
|
|
91
|
+
)
|
|
92
|
+
logger.info("Falling back to downloading full remote repository...")
|
|
93
|
+
|
|
94
|
+
# Fall back to the original approach: clone the full repo
|
|
95
|
+
cache.download_from_remote()
|
|
96
|
+
download_time = time.time() - start_time
|
|
97
|
+
logger.info(f"Downloaded remote results in {download_time:.2f}s")
|
|
98
|
+
|
|
99
|
+
load_start = time.time()
|
|
100
|
+
all_model_names = [model_meta.name for model_meta in mteb.get_model_metas()]
|
|
101
|
+
|
|
102
|
+
all_results = cache.load_results(
|
|
103
|
+
models=all_model_names,
|
|
104
|
+
only_main_score=True,
|
|
105
|
+
require_model_meta=False,
|
|
106
|
+
include_remote=True,
|
|
107
|
+
)
|
|
108
|
+
load_time = time.time() - load_start
|
|
109
|
+
logger.info(f"Loaded results from cache in {load_time:.2f}s")
|
|
110
|
+
return all_results
|
|
111
|
+
|
|
112
|
+
# Load the cached results file (either pre-existing or just downloaded)
|
|
113
|
+
logger.info("Loading cached results from disk...")
|
|
114
|
+
try:
|
|
115
|
+
logger.info(f"Opening file: {results_cache_path}")
|
|
116
|
+
|
|
117
|
+
file_size = results_cache_path.stat().st_size
|
|
118
|
+
logger.info(f"File exists, size: {file_size} bytes")
|
|
119
|
+
|
|
53
120
|
with results_cache_path.open() as cache_file:
|
|
54
|
-
|
|
121
|
+
logger.info("File opened successfully, attempting JSON parse...")
|
|
122
|
+
json_data = json.load(cache_file)
|
|
123
|
+
logger.info(
|
|
124
|
+
f"JSON parsed successfully, keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'not a dict'}"
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
logger.info("Attempting BenchmarkResults.from_validated...")
|
|
128
|
+
results = mteb.BenchmarkResults.from_validated(**json_data)
|
|
129
|
+
logger.info("BenchmarkResults.from_validated successful")
|
|
130
|
+
|
|
131
|
+
except Exception as e:
|
|
132
|
+
# TODO: Handle the case when we fail to load cached results from disk.
|
|
133
|
+
logger.error(
|
|
134
|
+
f"Failed to load cached results from disk: {type(e).__name__}: {e}"
|
|
135
|
+
)
|
|
136
|
+
raise
|
|
137
|
+
|
|
138
|
+
total_time = time.time() - start_time
|
|
139
|
+
logger.info(f"Loaded cached results in {total_time:.2f}s")
|
|
140
|
+
return results
|
|
55
141
|
|
|
56
142
|
|
|
57
143
|
def _produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str:
|
|
@@ -107,7 +193,9 @@ def _update_description(
|
|
|
107
193
|
description += f" - **Number of task types**: {n_task_types}\n"
|
|
108
194
|
description += f" - **Number of domains**: {n_domains}\n"
|
|
109
195
|
if benchmark.reference is not None:
|
|
110
|
-
description +=
|
|
196
|
+
description += (
|
|
197
|
+
f'\n<a href="{benchmark.reference}" target="_blank">Click for More Info</a>'
|
|
198
|
+
)
|
|
111
199
|
|
|
112
200
|
return description
|
|
113
201
|
|
|
@@ -137,7 +225,10 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
|
|
|
137
225
|
df["languages"] = df["languages"].map(_format_list)
|
|
138
226
|
df = df.sort_values("name")
|
|
139
227
|
df["domains"] = df["domains"].map(_format_list)
|
|
140
|
-
df["name"] =
|
|
228
|
+
df["name"] = df.apply(
|
|
229
|
+
lambda row: f'<a href="{row["reference"]}" target="_blank">{row["name"]}</a>',
|
|
230
|
+
axis=1,
|
|
231
|
+
)
|
|
141
232
|
df["modalities"] = df["modalities"].map(_format_list)
|
|
142
233
|
df = df.rename(
|
|
143
234
|
columns={
|
|
@@ -154,8 +245,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
|
|
|
154
245
|
return gr.DataFrame(
|
|
155
246
|
df,
|
|
156
247
|
datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
|
|
157
|
-
|
|
158
|
-
show_fullscreen_button=True,
|
|
248
|
+
buttons=["copy", "fullscreen"],
|
|
159
249
|
show_search="filter",
|
|
160
250
|
)
|
|
161
251
|
|
|
@@ -172,6 +262,7 @@ def _filter_models(
|
|
|
172
262
|
instructions: bool | None,
|
|
173
263
|
max_model_size: int,
|
|
174
264
|
zero_shot_setting: Literal["only_zero_shot", "allow_all", "remove_unknown"],
|
|
265
|
+
model_types: list[str] | None,
|
|
175
266
|
):
|
|
176
267
|
lower, upper = 0, max_model_size
|
|
177
268
|
# Setting to None, when the user doesn't specify anything
|
|
@@ -190,6 +281,7 @@ def _filter_models(
|
|
|
190
281
|
use_instructions=instructions,
|
|
191
282
|
frameworks=compatibility,
|
|
192
283
|
n_parameters_range=(lower, upper),
|
|
284
|
+
model_types=model_types,
|
|
193
285
|
)
|
|
194
286
|
|
|
195
287
|
models_to_keep = set()
|
|
@@ -213,21 +305,155 @@ def _should_show_zero_shot_filter(benchmark_name: str) -> bool:
|
|
|
213
305
|
return True
|
|
214
306
|
|
|
215
307
|
|
|
308
|
+
@cachetools.cached(
|
|
309
|
+
cache={},
|
|
310
|
+
key=lambda benchmark_name, all_benchmark_results: hash(benchmark_name),
|
|
311
|
+
)
|
|
312
|
+
def _cache_on_benchmark_select(benchmark_name, all_benchmark_results):
|
|
313
|
+
start_time = time.time()
|
|
314
|
+
benchmark = mteb.get_benchmark(benchmark_name)
|
|
315
|
+
languages = [task.languages for task in benchmark.tasks if task.languages]
|
|
316
|
+
languages = set(itertools.chain.from_iterable(languages))
|
|
317
|
+
languages = sorted(languages)
|
|
318
|
+
domains = [
|
|
319
|
+
task.metadata.domains for task in benchmark.tasks if task.metadata.domains
|
|
320
|
+
]
|
|
321
|
+
domains = set(itertools.chain.from_iterable(domains))
|
|
322
|
+
types = {task.metadata.type for task in benchmark.tasks if task.metadata.type}
|
|
323
|
+
modalities = set()
|
|
324
|
+
for task in benchmark.tasks:
|
|
325
|
+
modalities.update(task.metadata.modalities)
|
|
326
|
+
languages, domains, types, modalities = (
|
|
327
|
+
sorted(languages),
|
|
328
|
+
sorted(domains),
|
|
329
|
+
sorted(types),
|
|
330
|
+
sorted(modalities),
|
|
331
|
+
)
|
|
332
|
+
elapsed = time.time() - start_time
|
|
333
|
+
benchmark_results = all_benchmark_results[benchmark_name]
|
|
334
|
+
scores = benchmark_results._get_scores(format="long")
|
|
335
|
+
logger.debug(f"on_benchmark_select callback: {elapsed}s")
|
|
336
|
+
show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
|
|
337
|
+
|
|
338
|
+
# Calculate initial models for this benchmark to avoid race conditions
|
|
339
|
+
benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
|
|
340
|
+
all_models_in_scores = list({entry["model_name"] for entry in scores})
|
|
341
|
+
initial_models = _filter_models(
|
|
342
|
+
all_models_in_scores,
|
|
343
|
+
benchmark_tasks,
|
|
344
|
+
availability=None,
|
|
345
|
+
compatibility=[],
|
|
346
|
+
instructions=None,
|
|
347
|
+
max_model_size=MAX_MODEL_SIZE,
|
|
348
|
+
zero_shot_setting="allow_all",
|
|
349
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
350
|
+
)
|
|
351
|
+
# Sort to ensure consistency with update_models
|
|
352
|
+
initial_models = sorted(initial_models)
|
|
353
|
+
|
|
354
|
+
return (
|
|
355
|
+
languages,
|
|
356
|
+
domains,
|
|
357
|
+
types,
|
|
358
|
+
modalities,
|
|
359
|
+
benchmark_tasks,
|
|
360
|
+
scores,
|
|
361
|
+
show_zero_shot,
|
|
362
|
+
initial_models,
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
@cachetools.cached(
|
|
367
|
+
cache={},
|
|
368
|
+
key=lambda benchmark_name,
|
|
369
|
+
type_select,
|
|
370
|
+
domain_select,
|
|
371
|
+
lang_select,
|
|
372
|
+
modality_select: hash(
|
|
373
|
+
(
|
|
374
|
+
hash(benchmark_name),
|
|
375
|
+
hash(tuple(type_select)),
|
|
376
|
+
hash(tuple(domain_select)),
|
|
377
|
+
hash(tuple(lang_select)),
|
|
378
|
+
hash(tuple(modality_select)),
|
|
379
|
+
)
|
|
380
|
+
),
|
|
381
|
+
)
|
|
382
|
+
def _cache_update_task_list(
|
|
383
|
+
benchmark_name, type_select, domain_select, lang_select, modality_select
|
|
384
|
+
):
|
|
385
|
+
if not len(lang_select):
|
|
386
|
+
return []
|
|
387
|
+
start_time = time.time()
|
|
388
|
+
benchmark_tasks = []
|
|
389
|
+
tasks_to_keep = []
|
|
390
|
+
for task in mteb.get_benchmark(benchmark_name).tasks:
|
|
391
|
+
benchmark_tasks.append(task.metadata.name)
|
|
392
|
+
if task.metadata.type not in type_select:
|
|
393
|
+
continue
|
|
394
|
+
if task.metadata.domains and not (
|
|
395
|
+
set(task.metadata.domains) & set(domain_select)
|
|
396
|
+
):
|
|
397
|
+
continue
|
|
398
|
+
if task.languages and not (set(task.languages) & set(lang_select)):
|
|
399
|
+
continue
|
|
400
|
+
if task.metadata.modalities and not (
|
|
401
|
+
set(task.metadata.modalities) & set(modality_select)
|
|
402
|
+
):
|
|
403
|
+
continue
|
|
404
|
+
tasks_to_keep.append(task.metadata.name)
|
|
405
|
+
benchmark_tasks.sort()
|
|
406
|
+
tasks_to_keep.sort()
|
|
407
|
+
elapsed = time.time() - start_time
|
|
408
|
+
logger.debug(f"update_task_list callback: {elapsed}s")
|
|
409
|
+
|
|
410
|
+
return benchmark_tasks, tasks_to_keep
|
|
411
|
+
|
|
412
|
+
|
|
216
413
|
def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
217
414
|
"""Returns a Gradio Blocks app for the MTEB leaderboard."""
|
|
218
|
-
|
|
415
|
+
app_start = time.time()
|
|
416
|
+
logger.info("=== Starting leaderboard app initialization ===")
|
|
417
|
+
|
|
418
|
+
logger.info("Step 1/7: Loading all benchmark results...")
|
|
419
|
+
load_start = time.time()
|
|
219
420
|
all_results = _load_results(cache)
|
|
421
|
+
load_time = time.time() - load_start
|
|
422
|
+
logger.info(f"Step 1/7 complete: Loaded results in {load_time:.2f}s")
|
|
220
423
|
|
|
424
|
+
logger.info("Step 2/7: Fetching benchmarks...")
|
|
425
|
+
bench_start = time.time()
|
|
221
426
|
benchmarks = sorted(
|
|
222
427
|
mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name
|
|
223
428
|
)
|
|
429
|
+
bench_time = time.time() - bench_start
|
|
430
|
+
logger.info(
|
|
431
|
+
f"Step 2/7 complete: Fetched {len(benchmarks)} benchmarks in {bench_time:.2f}s"
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
logger.info(
|
|
435
|
+
"Step 3/7: Processing all benchmarks (select_tasks + join_revisions)..."
|
|
436
|
+
)
|
|
437
|
+
process_start = time.time()
|
|
224
438
|
all_benchmark_results = {
|
|
225
439
|
benchmark.name: all_results.select_tasks(benchmark.tasks).join_revisions()
|
|
226
440
|
for benchmark in benchmarks
|
|
227
441
|
}
|
|
442
|
+
process_time = time.time() - process_start
|
|
443
|
+
if len(benchmarks) > 0:
|
|
444
|
+
logger.info(
|
|
445
|
+
f"Step 3/7 complete: Processed {len(benchmarks)} benchmarks in {process_time:.2f}s (avg {process_time / len(benchmarks):.2f}s/benchmark)"
|
|
446
|
+
)
|
|
447
|
+
else:
|
|
448
|
+
logger.info(
|
|
449
|
+
f"Step 3/7 complete: Processed 0 benchmarks in {process_time:.2f}s (avg N/A)"
|
|
450
|
+
)
|
|
451
|
+
|
|
228
452
|
default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME)
|
|
229
453
|
default_results = all_benchmark_results[default_benchmark.name]
|
|
230
|
-
|
|
454
|
+
|
|
455
|
+
logger.info("Step 4/7: Filtering models...")
|
|
456
|
+
filter_start = time.time()
|
|
231
457
|
|
|
232
458
|
default_scores = default_results._get_scores(format="long")
|
|
233
459
|
all_models = list({entry["model_name"] for entry in default_scores})
|
|
@@ -239,6 +465,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
239
465
|
instructions=None,
|
|
240
466
|
max_model_size=MAX_MODEL_SIZE,
|
|
241
467
|
zero_shot_setting="allow_all",
|
|
468
|
+
model_types=MODEL_TYPE_CHOICES,
|
|
242
469
|
)
|
|
243
470
|
default_filtered_scores = [
|
|
244
471
|
entry for entry in default_scores if entry["model_name"] in filtered_models
|
|
@@ -247,63 +474,79 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
247
474
|
# Filter BenchmarkResults based on default filtered models (as required by Kenneth)
|
|
248
475
|
filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
|
|
249
476
|
filtered_benchmark_results = default_results.select_models(filtered_model_names)
|
|
477
|
+
filter_time = time.time() - filter_start
|
|
478
|
+
logger.info(
|
|
479
|
+
f"Step 4/7 complete: Filtered {len(filtered_model_names)} models in {filter_time:.2f}s"
|
|
480
|
+
)
|
|
250
481
|
|
|
482
|
+
logger.info("Step 5/7: Generating tables...")
|
|
483
|
+
table_start = time.time()
|
|
251
484
|
summary_table = apply_summary_styling_from_benchmark(
|
|
252
485
|
default_benchmark, filtered_benchmark_results
|
|
253
486
|
)
|
|
254
487
|
per_task_table = apply_per_task_styling_from_benchmark(
|
|
255
488
|
default_benchmark, filtered_benchmark_results
|
|
256
489
|
)
|
|
490
|
+
per_language_table = apply_per_language_styling_from_benchmark(
|
|
491
|
+
default_benchmark,
|
|
492
|
+
filtered_benchmark_results,
|
|
493
|
+
)
|
|
494
|
+
table_time = time.time() - table_start
|
|
495
|
+
logger.info(f"Step 5/7 complete: Generated tables in {table_time:.2f}s")
|
|
257
496
|
|
|
258
|
-
|
|
259
|
-
|
|
497
|
+
# Check if this benchmark displays per-language results
|
|
498
|
+
display_language_table = len(default_benchmark.language_view) > 0
|
|
499
|
+
|
|
500
|
+
logger.info("Step 6/7: Creating Gradio components...")
|
|
501
|
+
component_start = time.time()
|
|
502
|
+
lang_select = gr.CheckboxGroup(
|
|
503
|
+
sorted(default_results.languages),
|
|
260
504
|
value=sorted(default_results.languages),
|
|
261
|
-
|
|
262
|
-
|
|
505
|
+
show_label=True,
|
|
506
|
+
show_select_all=True,
|
|
263
507
|
label="Language",
|
|
264
508
|
info="Select languages to include.",
|
|
265
509
|
)
|
|
266
|
-
type_select = gr.
|
|
267
|
-
sorted(
|
|
510
|
+
type_select = gr.CheckboxGroup(
|
|
511
|
+
sorted(default_results.task_types),
|
|
268
512
|
value=sorted(default_results.task_types),
|
|
269
|
-
|
|
513
|
+
show_label=True,
|
|
514
|
+
show_select_all=True,
|
|
270
515
|
label="Task Type",
|
|
271
516
|
info="Select task types to include.",
|
|
272
517
|
)
|
|
273
|
-
domain_select = gr.
|
|
274
|
-
sorted(
|
|
518
|
+
domain_select = gr.CheckboxGroup(
|
|
519
|
+
sorted(default_results.domains),
|
|
275
520
|
value=sorted(default_results.domains),
|
|
276
|
-
|
|
521
|
+
show_label=True,
|
|
522
|
+
show_select_all=True,
|
|
277
523
|
label="Domain",
|
|
278
524
|
info="Select domains to include.",
|
|
279
525
|
)
|
|
280
|
-
task_select = gr.
|
|
281
|
-
sorted(
|
|
526
|
+
task_select = gr.CheckboxGroup(
|
|
527
|
+
sorted(default_results.task_names),
|
|
282
528
|
value=sorted(default_results.task_names),
|
|
283
|
-
|
|
284
|
-
|
|
529
|
+
show_label=True,
|
|
530
|
+
show_select_all=True,
|
|
285
531
|
label="Task",
|
|
286
532
|
info="Select specific tasks to include",
|
|
287
533
|
)
|
|
288
|
-
modality_select = gr.
|
|
289
|
-
sorted(
|
|
534
|
+
modality_select = gr.CheckboxGroup(
|
|
535
|
+
sorted(default_results.modalities),
|
|
290
536
|
value=sorted(default_results.modalities),
|
|
291
|
-
|
|
537
|
+
show_label=True,
|
|
538
|
+
show_select_all=True,
|
|
292
539
|
label="Modality",
|
|
293
540
|
info="Select modalities to include.",
|
|
294
541
|
)
|
|
542
|
+
component_time = time.time() - component_start
|
|
543
|
+
logger.info(
|
|
544
|
+
f"Step 6/7 complete: Created Gradio components in {component_time:.2f}s"
|
|
545
|
+
)
|
|
295
546
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
with gr.Blocks(
|
|
301
|
-
fill_width=True,
|
|
302
|
-
theme=gr.themes.Soft(
|
|
303
|
-
font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
|
|
304
|
-
),
|
|
305
|
-
head=head,
|
|
306
|
-
) as demo:
|
|
547
|
+
logger.info("Step 7/7: Building Gradio interface and callbacks...")
|
|
548
|
+
interface_start = time.time()
|
|
549
|
+
with gr.Blocks(fill_width=True) as demo:
|
|
307
550
|
with gr.Sidebar(
|
|
308
551
|
position="left",
|
|
309
552
|
label="Benchmark Selection and Customization",
|
|
@@ -419,6 +662,12 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
419
662
|
label="Model Parameters",
|
|
420
663
|
interactive=True,
|
|
421
664
|
)
|
|
665
|
+
with gr.Column():
|
|
666
|
+
model_type_select = gr.CheckboxGroup(
|
|
667
|
+
MODEL_TYPE_CHOICES,
|
|
668
|
+
value=MODEL_TYPE_CHOICES,
|
|
669
|
+
label="Model Type",
|
|
670
|
+
)
|
|
422
671
|
|
|
423
672
|
with gr.Tab("Summary"):
|
|
424
673
|
summary_table.render()
|
|
@@ -435,9 +684,6 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
435
684
|
|
|
436
685
|
with gr.Tab("Performance per Model Size") as plot_tab:
|
|
437
686
|
plot = gr.Plot(_performance_size_plot, inputs=[summary_table])
|
|
438
|
-
gr.Markdown(
|
|
439
|
-
"*We only display TOP 5 models that have been run on all tasks in the benchmark*"
|
|
440
|
-
)
|
|
441
687
|
plot_tab.select(
|
|
442
688
|
_performance_size_plot, inputs=[summary_table], outputs=[plot]
|
|
443
689
|
)
|
|
@@ -457,68 +703,41 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
457
703
|
download_per_task.click(
|
|
458
704
|
_download_table, inputs=[per_task_table], outputs=[download_per_task]
|
|
459
705
|
)
|
|
706
|
+
with gr.Tab(
|
|
707
|
+
"Performance per language", visible=display_language_table
|
|
708
|
+
) as language_tab:
|
|
709
|
+
per_language_table.render()
|
|
710
|
+
download_per_language = gr.DownloadButton("Download Table")
|
|
711
|
+
download_per_language.click(
|
|
712
|
+
_download_table,
|
|
713
|
+
inputs=[per_language_table],
|
|
714
|
+
outputs=[download_per_language],
|
|
715
|
+
)
|
|
460
716
|
with gr.Tab("Task information"):
|
|
461
717
|
task_info_table = gr.DataFrame(_update_task_info, inputs=[task_select]) # noqa: F841
|
|
462
718
|
|
|
463
719
|
# This sets the benchmark from the URL query parameters
|
|
464
720
|
demo.load(_set_benchmark_on_load, inputs=[], outputs=[benchmark_select])
|
|
465
721
|
|
|
466
|
-
@cachetools.cached(
|
|
467
|
-
cache={},
|
|
468
|
-
key=lambda benchmark_name: hash(benchmark_name),
|
|
469
|
-
)
|
|
470
722
|
def on_benchmark_select(benchmark_name):
|
|
471
|
-
|
|
472
|
-
benchmark = mteb.get_benchmark(benchmark_name)
|
|
473
|
-
languages = [task.languages for task in benchmark.tasks if task.languages]
|
|
474
|
-
languages = set(itertools.chain.from_iterable(languages))
|
|
475
|
-
languages = sorted(languages)
|
|
476
|
-
domains = [
|
|
477
|
-
task.metadata.domains
|
|
478
|
-
for task in benchmark.tasks
|
|
479
|
-
if task.metadata.domains
|
|
480
|
-
]
|
|
481
|
-
domains = set(itertools.chain.from_iterable(domains))
|
|
482
|
-
types = {
|
|
483
|
-
task.metadata.type for task in benchmark.tasks if task.metadata.type
|
|
484
|
-
}
|
|
485
|
-
modalities = set()
|
|
486
|
-
for task in benchmark.tasks:
|
|
487
|
-
modalities.update(task.metadata.modalities)
|
|
488
|
-
languages, domains, types, modalities = (
|
|
489
|
-
sorted(languages),
|
|
490
|
-
sorted(domains),
|
|
491
|
-
sorted(types),
|
|
492
|
-
sorted(modalities),
|
|
493
|
-
)
|
|
494
|
-
elapsed = time.time() - start_time
|
|
495
|
-
benchmark_results = all_benchmark_results[benchmark_name]
|
|
496
|
-
scores = benchmark_results._get_scores(format="long")
|
|
497
|
-
logger.debug(f"on_benchmark_select callback: {elapsed}s")
|
|
498
|
-
show_zero_shot = _should_show_zero_shot_filter(benchmark_name)
|
|
499
|
-
|
|
500
|
-
# Calculate initial models for this benchmark to avoid race conditions
|
|
501
|
-
benchmark_tasks = sorted([task.metadata.name for task in benchmark.tasks])
|
|
502
|
-
all_models_in_scores = list({entry["model_name"] for entry in scores})
|
|
503
|
-
initial_models = _filter_models(
|
|
504
|
-
all_models_in_scores,
|
|
505
|
-
benchmark_tasks,
|
|
506
|
-
availability=None,
|
|
507
|
-
compatibility=[],
|
|
508
|
-
instructions=None,
|
|
509
|
-
max_model_size=MAX_MODEL_SIZE,
|
|
510
|
-
zero_shot_setting="allow_all",
|
|
511
|
-
)
|
|
512
|
-
# Sort to ensure consistency with update_models
|
|
513
|
-
initial_models = sorted(initial_models)
|
|
514
|
-
|
|
515
|
-
return (
|
|
723
|
+
(
|
|
516
724
|
languages,
|
|
517
725
|
domains,
|
|
518
726
|
types,
|
|
519
727
|
modalities,
|
|
520
728
|
benchmark_tasks,
|
|
521
729
|
scores,
|
|
730
|
+
show_zero_shot,
|
|
731
|
+
initial_models,
|
|
732
|
+
) = _cache_on_benchmark_select(benchmark_name, all_benchmark_results)
|
|
733
|
+
|
|
734
|
+
return (
|
|
735
|
+
gr.update(choices=languages, value=languages),
|
|
736
|
+
gr.update(choices=domains, value=domains),
|
|
737
|
+
gr.update(choices=types, value=types),
|
|
738
|
+
gr.update(choices=modalities, value=modalities),
|
|
739
|
+
gr.update(choices=benchmark_tasks, value=benchmark_tasks),
|
|
740
|
+
scores,
|
|
522
741
|
gr.update(visible=show_zero_shot),
|
|
523
742
|
initial_models,
|
|
524
743
|
)
|
|
@@ -560,48 +779,13 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
560
779
|
outputs=[scores],
|
|
561
780
|
)
|
|
562
781
|
|
|
563
|
-
@cachetools.cached(
|
|
564
|
-
cache={},
|
|
565
|
-
key=lambda benchmark_name,
|
|
566
|
-
type_select,
|
|
567
|
-
domain_select,
|
|
568
|
-
lang_select,
|
|
569
|
-
modality_select: hash(
|
|
570
|
-
(
|
|
571
|
-
hash(benchmark_name),
|
|
572
|
-
hash(tuple(type_select)),
|
|
573
|
-
hash(tuple(domain_select)),
|
|
574
|
-
hash(tuple(lang_select)),
|
|
575
|
-
hash(tuple(modality_select)),
|
|
576
|
-
)
|
|
577
|
-
),
|
|
578
|
-
)
|
|
579
782
|
def update_task_list(
|
|
580
783
|
benchmark_name, type_select, domain_select, lang_select, modality_select
|
|
581
784
|
):
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
for task in mteb.get_benchmark(benchmark_name).tasks:
|
|
587
|
-
if task.metadata.type not in type_select:
|
|
588
|
-
continue
|
|
589
|
-
if task.metadata.domains is not None and not (
|
|
590
|
-
set(task.metadata.domains) & set(domain_select)
|
|
591
|
-
):
|
|
592
|
-
continue
|
|
593
|
-
if task.languages is not None and not (
|
|
594
|
-
set(task.languages) & set(lang_select)
|
|
595
|
-
):
|
|
596
|
-
continue
|
|
597
|
-
if task.metadata.modalities and not (
|
|
598
|
-
set(task.metadata.modalities) & set(modality_select)
|
|
599
|
-
):
|
|
600
|
-
continue
|
|
601
|
-
tasks_to_keep.append(task.metadata.name)
|
|
602
|
-
elapsed = time.time() - start_time
|
|
603
|
-
logger.debug(f"update_task_list callback: {elapsed}s")
|
|
604
|
-
return sorted(tasks_to_keep)
|
|
785
|
+
benchmark_tasks, tasks_to_keep = _cache_update_task_list(
|
|
786
|
+
benchmark_name, type_select, domain_select, lang_select, modality_select
|
|
787
|
+
)
|
|
788
|
+
return gr.update(choices=benchmark_tasks, value=tasks_to_keep)
|
|
605
789
|
|
|
606
790
|
type_select.input(
|
|
607
791
|
update_task_list,
|
|
@@ -656,7 +840,8 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
656
840
|
compatibility,
|
|
657
841
|
instructions,
|
|
658
842
|
max_model_size,
|
|
659
|
-
zero_shot
|
|
843
|
+
zero_shot,
|
|
844
|
+
model_type_select: hash(
|
|
660
845
|
(
|
|
661
846
|
id(scores),
|
|
662
847
|
hash(tuple(tasks)),
|
|
@@ -665,6 +850,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
665
850
|
hash(instructions),
|
|
666
851
|
hash(max_model_size),
|
|
667
852
|
hash(zero_shot),
|
|
853
|
+
hash(tuple(model_type_select)),
|
|
668
854
|
)
|
|
669
855
|
),
|
|
670
856
|
)
|
|
@@ -676,6 +862,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
676
862
|
instructions: bool | None,
|
|
677
863
|
max_model_size: int,
|
|
678
864
|
zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"],
|
|
865
|
+
model_type_select: list[str],
|
|
679
866
|
):
|
|
680
867
|
start_time = time.time()
|
|
681
868
|
model_names = list({entry["model_name"] for entry in scores})
|
|
@@ -687,6 +874,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
687
874
|
instructions,
|
|
688
875
|
max_model_size,
|
|
689
876
|
zero_shot_setting=zero_shot,
|
|
877
|
+
model_types=model_type_select,
|
|
690
878
|
)
|
|
691
879
|
elapsed = time.time() - start_time
|
|
692
880
|
logger.debug(f"update_models callback: {elapsed}s")
|
|
@@ -704,6 +892,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
704
892
|
instructions,
|
|
705
893
|
max_model_size,
|
|
706
894
|
zero_shot,
|
|
895
|
+
model_type_select,
|
|
707
896
|
],
|
|
708
897
|
outputs=[models],
|
|
709
898
|
)
|
|
@@ -718,6 +907,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
718
907
|
instructions,
|
|
719
908
|
max_model_size,
|
|
720
909
|
zero_shot,
|
|
910
|
+
model_type_select,
|
|
721
911
|
],
|
|
722
912
|
outputs=[models],
|
|
723
913
|
)
|
|
@@ -731,6 +921,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
731
921
|
instructions,
|
|
732
922
|
max_model_size,
|
|
733
923
|
zero_shot,
|
|
924
|
+
model_type_select,
|
|
734
925
|
],
|
|
735
926
|
outputs=[models],
|
|
736
927
|
)
|
|
@@ -744,6 +935,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
744
935
|
instructions,
|
|
745
936
|
max_model_size,
|
|
746
937
|
zero_shot,
|
|
938
|
+
model_type_select,
|
|
747
939
|
],
|
|
748
940
|
outputs=[models],
|
|
749
941
|
)
|
|
@@ -757,6 +949,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
757
949
|
instructions,
|
|
758
950
|
max_model_size,
|
|
759
951
|
zero_shot,
|
|
952
|
+
model_type_select,
|
|
760
953
|
],
|
|
761
954
|
outputs=[models],
|
|
762
955
|
)
|
|
@@ -770,6 +963,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
770
963
|
instructions,
|
|
771
964
|
max_model_size,
|
|
772
965
|
zero_shot,
|
|
966
|
+
model_type_select,
|
|
773
967
|
],
|
|
774
968
|
outputs=[models],
|
|
775
969
|
)
|
|
@@ -783,6 +977,21 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
783
977
|
instructions,
|
|
784
978
|
max_model_size,
|
|
785
979
|
zero_shot,
|
|
980
|
+
model_type_select,
|
|
981
|
+
],
|
|
982
|
+
outputs=[models],
|
|
983
|
+
)
|
|
984
|
+
model_type_select.change(
|
|
985
|
+
update_models,
|
|
986
|
+
inputs=[
|
|
987
|
+
scores,
|
|
988
|
+
task_select,
|
|
989
|
+
availability,
|
|
990
|
+
compatibility,
|
|
991
|
+
instructions,
|
|
992
|
+
max_model_size,
|
|
993
|
+
zero_shot,
|
|
994
|
+
model_type_select,
|
|
786
995
|
],
|
|
787
996
|
outputs=[models],
|
|
788
997
|
)
|
|
@@ -854,9 +1063,18 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
854
1063
|
per_task = apply_per_task_styling_from_benchmark(
|
|
855
1064
|
benchmark, filtered_benchmark_results
|
|
856
1065
|
)
|
|
1066
|
+
per_language = apply_per_language_styling_from_benchmark(
|
|
1067
|
+
benchmark,
|
|
1068
|
+
filtered_benchmark_results,
|
|
1069
|
+
)
|
|
857
1070
|
elapsed = time.time() - start_time
|
|
858
1071
|
logger.debug(f"update_tables callback: {elapsed}s")
|
|
859
|
-
return
|
|
1072
|
+
return (
|
|
1073
|
+
summary,
|
|
1074
|
+
per_task,
|
|
1075
|
+
per_language,
|
|
1076
|
+
gr.update(visible=len(benchmark.language_view) > 0),
|
|
1077
|
+
)
|
|
860
1078
|
|
|
861
1079
|
# Only update tables when models change, not when scores/tasks change directly
|
|
862
1080
|
# This avoids redundant updates since scores/tasks changes trigger update_models
|
|
@@ -865,11 +1083,20 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
865
1083
|
item.change(
|
|
866
1084
|
update_tables,
|
|
867
1085
|
inputs=[scores, task_select, models, benchmark_select],
|
|
868
|
-
outputs=[
|
|
1086
|
+
outputs=[
|
|
1087
|
+
summary_table,
|
|
1088
|
+
per_task_table,
|
|
1089
|
+
per_language_table,
|
|
1090
|
+
language_tab,
|
|
1091
|
+
],
|
|
869
1092
|
)
|
|
870
1093
|
|
|
871
1094
|
gr.Markdown(ACKNOWLEDGEMENT, elem_id="ack_markdown")
|
|
1095
|
+
interface_time = time.time() - interface_start
|
|
1096
|
+
logger.info(f"Step 7/7 complete: Built Gradio interface in {interface_time:.2f}s")
|
|
872
1097
|
|
|
1098
|
+
logger.info("Starting prerun on all benchmarks to populate caches...")
|
|
1099
|
+
prerun_start = time.time()
|
|
873
1100
|
# Prerun on all benchmarks, so that results of callbacks get cached
|
|
874
1101
|
for benchmark in benchmarks:
|
|
875
1102
|
(
|
|
@@ -895,20 +1122,56 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
895
1122
|
update_tables(
|
|
896
1123
|
bench_scores, filtered_tasks, bench_initial_models, benchmark.name
|
|
897
1124
|
)
|
|
1125
|
+
prerun_time = time.time() - prerun_start
|
|
1126
|
+
logger.info(
|
|
1127
|
+
f"Prerun complete: Processed {len(benchmarks)} benchmarks in {prerun_time:.2f}s"
|
|
1128
|
+
)
|
|
1129
|
+
|
|
1130
|
+
total_time = time.time() - app_start
|
|
1131
|
+
logger.info(f"=== Leaderboard app initialization complete in {total_time:.2f}s ===")
|
|
898
1132
|
return demo
|
|
899
1133
|
|
|
900
1134
|
|
|
901
1135
|
if __name__ == "__main__":
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
logging.
|
|
906
|
-
logging.
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
1136
|
+
import os
|
|
1137
|
+
|
|
1138
|
+
# Add process ID to logging for multiprocessing debugging
|
|
1139
|
+
logging.basicConfig(
|
|
1140
|
+
level=logging.INFO,
|
|
1141
|
+
format="%(asctime)s - PID:%(process)d - %(name)s - %(levelname)s - %(message)s",
|
|
1142
|
+
force=True, # Override any existing handlers
|
|
1143
|
+
)
|
|
1144
|
+
|
|
1145
|
+
# Flush log handlers immediately (helpful for multiprocessing)
|
|
1146
|
+
for handler in logging.root.handlers:
|
|
1147
|
+
handler.flush()
|
|
1148
|
+
|
|
1149
|
+
logger.info(f"Starting leaderboard app in process {os.getpid()}")
|
|
1150
|
+
|
|
1151
|
+
# Suppress specific WARNING messages while keeping INFO level for the app
|
|
1152
|
+
logging.getLogger("mteb.results.task_result").setLevel(logging.ERROR)
|
|
1153
|
+
logging.getLogger("mteb.models.model_meta").setLevel(logging.ERROR)
|
|
1154
|
+
logging.getLogger("mteb.results.benchmark_results").setLevel(logging.ERROR)
|
|
1155
|
+
|
|
911
1156
|
warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*")
|
|
1157
|
+
warnings.filterwarnings("ignore", message="Could not get source model: .*")
|
|
1158
|
+
warnings.filterwarnings(
|
|
1159
|
+
"ignore", message="No scores data available. Returning empty DataFrame."
|
|
1160
|
+
)
|
|
1161
|
+
warnings.filterwarnings("ignore", message="Main score .* not found in scores")
|
|
1162
|
+
warnings.filterwarnings("ignore", message=".*: Missing subsets .* for split .*")
|
|
1163
|
+
warnings.filterwarnings("ignore", message=".*: Missing splits .*")
|
|
912
1164
|
|
|
913
1165
|
app = get_leaderboard_app()
|
|
914
|
-
|
|
1166
|
+
|
|
1167
|
+
head = """
|
|
1168
|
+
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
|
|
1169
|
+
"""
|
|
1170
|
+
app.launch(
|
|
1171
|
+
server_name="0.0.0.0",
|
|
1172
|
+
server_port=7860,
|
|
1173
|
+
theme=gr.themes.Soft(
|
|
1174
|
+
font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
|
|
1175
|
+
),
|
|
1176
|
+
head=head,
|
|
1177
|
+
)
|