mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"standard": {
|
|
3
|
+
"num_samples": 23904,
|
|
4
|
+
"number_of_characters": 20825122,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 20797224,
|
|
7
|
+
"min_text_length": 74,
|
|
8
|
+
"average_text_length": 872.4033726246906,
|
|
9
|
+
"max_text_length": 19104,
|
|
10
|
+
"unique_texts": 23839
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 27898,
|
|
15
|
+
"min_text_length": 13,
|
|
16
|
+
"average_text_length": 429.2,
|
|
17
|
+
"max_text_length": 1255,
|
|
18
|
+
"unique_texts": 65
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 126,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.9384615384615385,
|
|
25
|
+
"max_relevant_docs_per_query": 6,
|
|
26
|
+
"unique_relevant_docs": 95
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": {
|
|
29
|
+
"num_top_ranked": 1549535,
|
|
30
|
+
"min_top_ranked_per_query": 23839,
|
|
31
|
+
"average_top_ranked_per_query": 23839.0,
|
|
32
|
+
"max_top_ranked_per_query": 23839
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
mteb/evaluate.py
CHANGED
|
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import warnings
|
|
5
|
-
from collections.abc import Iterable
|
|
6
5
|
from pathlib import Path
|
|
7
6
|
from time import time
|
|
8
7
|
from typing import TYPE_CHECKING, cast
|
|
@@ -17,22 +16,25 @@ from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
|
17
16
|
from mteb.benchmarks.benchmark import Benchmark
|
|
18
17
|
from mteb.cache import ResultCache
|
|
19
18
|
from mteb.models.model_meta import ModelMeta
|
|
20
|
-
from mteb.models.models_protocols import (
|
|
21
|
-
MTEBModels,
|
|
22
|
-
)
|
|
23
19
|
from mteb.models.sentence_transformer_wrapper import (
|
|
24
20
|
CrossEncoderWrapper,
|
|
25
21
|
SentenceTransformerEncoderWrapper,
|
|
26
22
|
)
|
|
27
23
|
from mteb.results import ModelResult, TaskResult
|
|
28
24
|
from mteb.results.task_result import TaskError
|
|
29
|
-
from mteb.types import
|
|
30
|
-
from mteb.types._encoder_io import EncodeKwargs
|
|
31
|
-
from mteb.types._metadata import ModelName, Revision
|
|
25
|
+
from mteb.types import PromptType
|
|
32
26
|
|
|
33
27
|
if TYPE_CHECKING:
|
|
28
|
+
from collections.abc import Iterable
|
|
29
|
+
|
|
34
30
|
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
35
31
|
|
|
32
|
+
from mteb.models.models_protocols import (
|
|
33
|
+
MTEBModels,
|
|
34
|
+
)
|
|
35
|
+
from mteb.types import EncodeKwargs, HFSubset, SplitName
|
|
36
|
+
from mteb.types._metadata import ModelName, Revision
|
|
37
|
+
|
|
36
38
|
logger = logging.getLogger(__name__)
|
|
37
39
|
|
|
38
40
|
|
|
@@ -69,13 +71,13 @@ def _sanitize_model(
|
|
|
69
71
|
meta = getattr(model, "mteb_model_meta")
|
|
70
72
|
if not isinstance(meta, ModelMeta):
|
|
71
73
|
meta = ModelMeta._from_hub(None)
|
|
72
|
-
wrapped_model = cast(MTEBModels | ModelMeta, model)
|
|
74
|
+
wrapped_model = cast("MTEBModels | ModelMeta", model)
|
|
73
75
|
else:
|
|
74
76
|
meta = ModelMeta._from_hub(None) if not isinstance(model, ModelMeta) else model
|
|
75
77
|
wrapped_model = meta
|
|
76
78
|
|
|
77
|
-
model_name = cast(str, meta.name)
|
|
78
|
-
model_revision = cast(str, meta.revision)
|
|
79
|
+
model_name = cast("str", meta.name)
|
|
80
|
+
model_revision = cast("str", meta.revision)
|
|
79
81
|
|
|
80
82
|
return wrapped_model, meta, model_name, model_revision
|
|
81
83
|
|
|
@@ -123,6 +125,7 @@ def _evaluate_task(
|
|
|
123
125
|
co2_tracker=False,
|
|
124
126
|
prediction_folder=prediction_folder,
|
|
125
127
|
public_only=public_only,
|
|
128
|
+
num_proc=num_proc,
|
|
126
129
|
)
|
|
127
130
|
if isinstance(result, TaskResult):
|
|
128
131
|
result.kg_co2_emissions = tracker.final_emissions
|
|
@@ -132,10 +135,10 @@ def _evaluate_task(
|
|
|
132
135
|
|
|
133
136
|
task.check_if_dataset_is_superseded()
|
|
134
137
|
|
|
135
|
-
|
|
136
|
-
if not
|
|
138
|
+
data_preloaded = task.data_loaded
|
|
139
|
+
if not data_preloaded:
|
|
137
140
|
try:
|
|
138
|
-
task.load_data()
|
|
141
|
+
task.load_data(num_proc=num_proc)
|
|
139
142
|
except DatasetNotFoundError as e:
|
|
140
143
|
if not task.metadata.is_public and public_only is None:
|
|
141
144
|
msg = (
|
|
@@ -161,6 +164,7 @@ def _evaluate_task(
|
|
|
161
164
|
subsets_to_run=hf_subsets,
|
|
162
165
|
encode_kwargs=encode_kwargs,
|
|
163
166
|
prediction_folder=prediction_folder,
|
|
167
|
+
num_proc=num_proc,
|
|
164
168
|
)
|
|
165
169
|
tock = time()
|
|
166
170
|
|
|
@@ -176,7 +180,7 @@ def _evaluate_task(
|
|
|
176
180
|
kg_co2_emissions=None,
|
|
177
181
|
)
|
|
178
182
|
|
|
179
|
-
if
|
|
183
|
+
if not data_preloaded: # only unload if we loaded the data
|
|
180
184
|
task.unload_data()
|
|
181
185
|
|
|
182
186
|
return result
|
|
@@ -202,10 +206,10 @@ def _check_model_modalities(
|
|
|
202
206
|
if isinstance(tasks, AbsTask):
|
|
203
207
|
check_tasks = [tasks]
|
|
204
208
|
elif isinstance(tasks, Benchmark):
|
|
205
|
-
benchmark = cast(Benchmark, tasks)
|
|
209
|
+
benchmark = cast("Benchmark", tasks)
|
|
206
210
|
check_tasks = benchmark.tasks
|
|
207
211
|
else:
|
|
208
|
-
check_tasks = cast(Iterable[AbsTask], tasks)
|
|
212
|
+
check_tasks = cast("Iterable[AbsTask]", tasks)
|
|
209
213
|
|
|
210
214
|
warnings, errors = [], []
|
|
211
215
|
|
|
@@ -278,6 +282,7 @@ def evaluate(
|
|
|
278
282
|
prediction_folder: Path | str | None = None,
|
|
279
283
|
show_progress_bar: bool = True,
|
|
280
284
|
public_only: bool | None = None,
|
|
285
|
+
num_proc: int = 1,
|
|
281
286
|
) -> ModelResult:
|
|
282
287
|
"""This function runs a model on a given task and returns the results.
|
|
283
288
|
|
|
@@ -286,7 +291,7 @@ def evaluate(
|
|
|
286
291
|
tasks: A task to run.
|
|
287
292
|
co2_tracker: If True, track the CO₂ emissions of the evaluation, required codecarbon to be installed, which can be installed using
|
|
288
293
|
`pip install mteb[codecarbon]`. If none is passed co2 tracking will only be run if codecarbon is installed.
|
|
289
|
-
encode_kwargs: Additional keyword arguments passed to the models `encode`
|
|
294
|
+
encode_kwargs: Additional keyword arguments passed to the models `encode` and `load_data` methods;
|
|
290
295
|
raise_error: If True, raise an error if the task fails. If False, return an empty list.
|
|
291
296
|
cache: The cache to use for loading the results. If None, then no cache will be used. The default cache saved the cache in the
|
|
292
297
|
`~/.cache/mteb` directory. It can be overridden by setting the `MTEB_CACHE` environment variable to a different directory or by directly
|
|
@@ -298,10 +303,11 @@ def evaluate(
|
|
|
298
303
|
changed.
|
|
299
304
|
- "only-cache": Only load the results from the cache folder and do not run the task. Useful if you just want to load the results from the
|
|
300
305
|
cache.
|
|
301
|
-
prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be
|
|
306
|
+
prediction_folder: Optional folder in which to save model predictions for the task. Predictions of the tasks will be saved in `prediction_folder/{task_name}_predictions.json`
|
|
302
307
|
show_progress_bar: Whether to show a progress bar when running the evaluation. Default is True. Setting this to False will also set the
|
|
303
308
|
`encode_kwargs['show_progress_bar']` to False if encode_kwargs is unspecified.
|
|
304
309
|
public_only: Run only public tasks. If None, it will attempt to run the private task.
|
|
310
|
+
num_proc: Number of processes to use during data loading and transformation. Defaults to 1.
|
|
305
311
|
|
|
306
312
|
Returns:
|
|
307
313
|
The results of the evaluation.
|
|
@@ -342,7 +348,7 @@ def evaluate(
|
|
|
342
348
|
|
|
343
349
|
# AbsTaskAggregate is a special case where we have to run multiple tasks and combine the results
|
|
344
350
|
if isinstance(tasks, AbsTaskAggregate):
|
|
345
|
-
aggregated_task = cast(AbsTaskAggregate, tasks)
|
|
351
|
+
aggregated_task = cast("AbsTaskAggregate", tasks)
|
|
346
352
|
results = evaluate(
|
|
347
353
|
model,
|
|
348
354
|
aggregated_task.metadata.tasks,
|
|
@@ -354,8 +360,12 @@ def evaluate(
|
|
|
354
360
|
prediction_folder=prediction_folder,
|
|
355
361
|
show_progress_bar=show_progress_bar,
|
|
356
362
|
public_only=public_only,
|
|
363
|
+
num_proc=num_proc,
|
|
357
364
|
)
|
|
358
365
|
combined_results = aggregated_task.combine_task_results(results.task_results)
|
|
366
|
+
if cache:
|
|
367
|
+
cache.save_to_cache(combined_results, meta)
|
|
368
|
+
|
|
359
369
|
return ModelResult(
|
|
360
370
|
model_name=results.model_name,
|
|
361
371
|
model_revision=results.model_revision,
|
|
@@ -365,7 +375,7 @@ def evaluate(
|
|
|
365
375
|
if isinstance(tasks, AbsTask):
|
|
366
376
|
task = tasks
|
|
367
377
|
else:
|
|
368
|
-
tasks = cast(Iterable[AbsTask], tasks)
|
|
378
|
+
tasks = cast("Iterable[AbsTask]", tasks)
|
|
369
379
|
evaluate_results = []
|
|
370
380
|
exceptions = []
|
|
371
381
|
tasks_tqdm = tqdm(
|
|
@@ -386,6 +396,7 @@ def evaluate(
|
|
|
386
396
|
prediction_folder=prediction_folder,
|
|
387
397
|
show_progress_bar=False,
|
|
388
398
|
public_only=public_only,
|
|
399
|
+
num_proc=num_proc,
|
|
389
400
|
)
|
|
390
401
|
evaluate_results.extend(_res.task_results)
|
|
391
402
|
if _res.exceptions:
|
|
@@ -465,6 +476,7 @@ def evaluate(
|
|
|
465
476
|
encode_kwargs=encode_kwargs,
|
|
466
477
|
prediction_folder=prediction_folder,
|
|
467
478
|
public_only=public_only,
|
|
479
|
+
num_proc=num_proc,
|
|
468
480
|
)
|
|
469
481
|
except Exception as e:
|
|
470
482
|
logger.error(
|
|
@@ -480,6 +492,7 @@ def evaluate(
|
|
|
480
492
|
encode_kwargs=encode_kwargs,
|
|
481
493
|
prediction_folder=prediction_folder,
|
|
482
494
|
public_only=public_only,
|
|
495
|
+
num_proc=num_proc,
|
|
483
496
|
)
|
|
484
497
|
logger.info(f"✓ Finished evaluation for {task.metadata.name}")
|
|
485
498
|
|
mteb/filter_tasks.py
CHANGED
|
@@ -1,19 +1,24 @@
|
|
|
1
1
|
"""This script contains functions that are used to get an overview of the MTEB benchmark."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
|
-
from
|
|
5
|
-
from typing import overload
|
|
6
|
+
from typing import TYPE_CHECKING, overload
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks import (
|
|
8
|
-
AbsTask,
|
|
9
|
-
)
|
|
10
8
|
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
12
9
|
from mteb.languages import (
|
|
13
10
|
ISO_TO_LANGUAGE,
|
|
14
11
|
ISO_TO_SCRIPT,
|
|
15
12
|
)
|
|
16
|
-
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Iterable, Sequence
|
|
16
|
+
|
|
17
|
+
from mteb.abstasks import (
|
|
18
|
+
AbsTask,
|
|
19
|
+
)
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
21
|
+
from mteb.types import Modalities
|
|
17
22
|
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
mteb/get_tasks.py
CHANGED
|
@@ -1,20 +1,25 @@
|
|
|
1
1
|
"""This script contains functions that are used to get an overview of the MTEB benchmark."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import difflib
|
|
4
6
|
import logging
|
|
5
7
|
import warnings
|
|
6
8
|
from collections import Counter, defaultdict
|
|
7
|
-
from
|
|
8
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
9
10
|
|
|
10
11
|
import pandas as pd
|
|
11
12
|
|
|
12
13
|
from mteb.abstasks import (
|
|
13
14
|
AbsTask,
|
|
14
15
|
)
|
|
15
|
-
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
16
16
|
from mteb.filter_tasks import filter_tasks
|
|
17
|
-
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Iterable, Sequence
|
|
20
|
+
|
|
21
|
+
from mteb.abstasks.task_metadata import TaskCategory, TaskDomain, TaskType
|
|
22
|
+
from mteb.types import Modalities
|
|
18
23
|
|
|
19
24
|
logger = logging.getLogger(__name__)
|
|
20
25
|
|
|
@@ -1,10 +1,15 @@
|
|
|
1
|
-
from
|
|
2
|
-
from dataclasses import dataclass
|
|
1
|
+
from __future__ import annotations
|
|
3
2
|
|
|
4
|
-
from
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
5
|
|
|
6
6
|
from mteb.languages.check_language_code import check_language_code
|
|
7
7
|
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Iterable, Sequence
|
|
10
|
+
|
|
11
|
+
from typing_extensions import Self
|
|
12
|
+
|
|
8
13
|
|
|
9
14
|
@dataclass
|
|
10
15
|
class LanguageScripts:
|
mteb/leaderboard/app.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import itertools
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
@@ -5,15 +7,14 @@ import tempfile
|
|
|
5
7
|
import time
|
|
6
8
|
import warnings
|
|
7
9
|
from pathlib import Path
|
|
8
|
-
from typing import Literal, get_args
|
|
10
|
+
from typing import TYPE_CHECKING, Literal, get_args
|
|
9
11
|
from urllib.parse import urlencode
|
|
10
12
|
|
|
11
13
|
import cachetools
|
|
12
14
|
import gradio as gr
|
|
13
|
-
import pandas as pd
|
|
15
|
+
import pandas as pd # noqa: TC002 # gradio tries to validate typehints
|
|
14
16
|
|
|
15
17
|
import mteb
|
|
16
|
-
from mteb import BenchmarkResults
|
|
17
18
|
from mteb.benchmarks.benchmark import RtebBenchmark
|
|
18
19
|
from mteb.cache import ResultCache
|
|
19
20
|
from mteb.leaderboard.benchmark_selector import (
|
|
@@ -31,6 +32,9 @@ from mteb.leaderboard.table import (
|
|
|
31
32
|
from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
|
|
32
33
|
from mteb.models.model_meta import MODEL_TYPES
|
|
33
34
|
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from mteb import BenchmarkResults
|
|
37
|
+
|
|
34
38
|
logger = logging.getLogger(__name__)
|
|
35
39
|
|
|
36
40
|
|
|
@@ -546,7 +550,10 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
546
550
|
|
|
547
551
|
logger.info("Step 7/7: Building Gradio interface and callbacks...")
|
|
548
552
|
interface_start = time.time()
|
|
549
|
-
with gr.Blocks(
|
|
553
|
+
with gr.Blocks(
|
|
554
|
+
title="MTEB Leaderboard",
|
|
555
|
+
fill_width=True,
|
|
556
|
+
) as demo:
|
|
550
557
|
with gr.Sidebar(
|
|
551
558
|
position="left",
|
|
552
559
|
label="Benchmark Selection and Customization",
|
mteb/leaderboard/table.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
import gradio as gr
|
|
2
6
|
import matplotlib.pyplot as plt
|
|
3
7
|
import numpy as np
|
|
@@ -5,8 +9,9 @@ import pandas as pd
|
|
|
5
9
|
from matplotlib.colors import LinearSegmentedColormap
|
|
6
10
|
from pandas.api.types import is_numeric_dtype
|
|
7
11
|
|
|
8
|
-
|
|
9
|
-
from mteb.
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from mteb.benchmarks.benchmark import Benchmark
|
|
14
|
+
from mteb.results.benchmark_results import BenchmarkResults
|
|
10
15
|
|
|
11
16
|
|
|
12
17
|
def _borda_count(scores: pd.Series) -> pd.Series:
|
mteb/load_results.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
import sys
|
|
4
|
-
from
|
|
5
|
-
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
8
|
from mteb.abstasks.abstask import AbsTask
|
|
8
9
|
from mteb.models.model_meta import ModelMeta
|
|
9
10
|
from mteb.results import BenchmarkResults, ModelResult, TaskResult
|
|
10
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable, Sequence
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from mteb.types import ModelName, Revision
|
|
11
17
|
|
|
12
18
|
if sys.version_info >= (3, 13):
|
|
13
19
|
from warnings import deprecated
|
mteb/models/abs_encoder.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import warnings
|
|
3
5
|
from abc import ABC, abstractmethod
|
|
4
|
-
from
|
|
5
|
-
from typing import Any, Literal, cast, get_args, overload
|
|
6
|
-
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
|
-
from typing_extensions import Unpack
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, cast, get_args, overload
|
|
9
7
|
|
|
10
8
|
import mteb
|
|
11
|
-
from mteb.abstasks.task_metadata import
|
|
9
|
+
from mteb.abstasks.task_metadata import TaskType
|
|
12
10
|
from mteb.similarity_functions import (
|
|
13
11
|
cos_sim,
|
|
14
12
|
dot_score,
|
|
@@ -18,13 +16,25 @@ from mteb.similarity_functions import (
|
|
|
18
16
|
pairwise_max_sim,
|
|
19
17
|
)
|
|
20
18
|
from mteb.types import (
|
|
21
|
-
Array,
|
|
22
|
-
BatchedInput,
|
|
23
|
-
EncodeKwargs,
|
|
24
19
|
PromptType,
|
|
25
20
|
)
|
|
26
21
|
|
|
27
|
-
from .model_meta import
|
|
22
|
+
from .model_meta import ScoringFunction
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from collections.abc import Callable, Sequence
|
|
26
|
+
|
|
27
|
+
from torch.utils.data import DataLoader
|
|
28
|
+
from typing_extensions import Unpack
|
|
29
|
+
|
|
30
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
31
|
+
from mteb.types import (
|
|
32
|
+
Array,
|
|
33
|
+
BatchedInput,
|
|
34
|
+
EncodeKwargs,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from .model_meta import ModelMeta
|
|
28
38
|
|
|
29
39
|
logger = logging.getLogger(__name__)
|
|
30
40
|
|
|
@@ -314,7 +324,7 @@ class AbsEncoder(ABC):
|
|
|
314
324
|
):
|
|
315
325
|
arr = self.model.similarity(embeddings1, embeddings2)
|
|
316
326
|
# We assume that the model returns an Array-like object:
|
|
317
|
-
arr = cast(Array, arr)
|
|
327
|
+
arr = cast("Array", arr)
|
|
318
328
|
return arr
|
|
319
329
|
return cos_sim(embeddings1, embeddings2)
|
|
320
330
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -352,7 +362,7 @@ class AbsEncoder(ABC):
|
|
|
352
362
|
):
|
|
353
363
|
arr = self.model.similarity_pairwise(embeddings1, embeddings2)
|
|
354
364
|
# We assume that the model returns an Array-like object:
|
|
355
|
-
arr = cast(Array, arr)
|
|
365
|
+
arr = cast("Array", arr)
|
|
356
366
|
return arr
|
|
357
367
|
return pairwise_cos_sim(embeddings1, embeddings2)
|
|
358
368
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, Protocol, runtime_checkable
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
@runtime_checkable
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
|
|
9
|
+
from PIL import Image
|
|
4
10
|
|
|
5
11
|
|
|
6
12
|
def _hash_item(item: Mapping[str, Any]) -> str:
|
|
@@ -10,8 +16,6 @@ def _hash_item(item: Mapping[str, Any]) -> str:
|
|
|
10
16
|
item_hash = hashlib.sha256(item_text.encode()).hexdigest()
|
|
11
17
|
|
|
12
18
|
if "image" in item:
|
|
13
|
-
from PIL import Image
|
|
14
|
-
|
|
15
19
|
image: Image.Image = item["image"]
|
|
16
20
|
item_hash += hashlib.sha256(image.tobytes()).hexdigest()
|
|
17
21
|
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
import warnings
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Any
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
6
8
|
|
|
7
9
|
import numpy as np
|
|
8
10
|
|
|
9
11
|
from mteb._requires_package import requires_package
|
|
10
|
-
from mteb.types import BatchedInput
|
|
11
12
|
|
|
12
13
|
from ._hash_utils import _hash_item
|
|
13
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import faiss
|
|
17
|
+
|
|
18
|
+
from mteb.types import BatchedInput
|
|
19
|
+
|
|
14
20
|
logger = logging.getLogger(__name__)
|
|
15
21
|
|
|
16
22
|
|
|
@@ -24,7 +30,6 @@ class FaissCache:
|
|
|
24
30
|
"FAISS-based vector cache",
|
|
25
31
|
install_instruction="pip install mteb[faiss-cpu]",
|
|
26
32
|
)
|
|
27
|
-
import faiss
|
|
28
33
|
|
|
29
34
|
self.directory = Path(directory)
|
|
30
35
|
self.directory.mkdir(parents=True, exist_ok=True)
|
|
@@ -1,21 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import torch
|
|
7
9
|
from datasets import Dataset
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
|
|
10
11
|
from mteb._create_dataloaders import create_dataloader
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
|
-
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
13
|
-
CacheBackendProtocol,
|
|
14
|
-
)
|
|
15
12
|
from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from torch.utils.data import DataLoader
|
|
16
|
+
|
|
17
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
18
|
+
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
19
|
+
CacheBackendProtocol,
|
|
20
|
+
)
|
|
21
|
+
from mteb.models.model_meta import ModelMeta
|
|
22
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
23
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
19
24
|
|
|
20
25
|
logger = logging.getLogger(__name__)
|
|
21
26
|
|
mteb/models/get_model_meta.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import difflib
|
|
2
4
|
import logging
|
|
3
|
-
from
|
|
4
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
5
6
|
|
|
6
|
-
from mteb.abstasks import AbsTask
|
|
7
7
|
from mteb.models import (
|
|
8
8
|
ModelMeta,
|
|
9
|
-
MTEBModels,
|
|
10
9
|
)
|
|
11
10
|
from mteb.models.model_implementations import MODEL_REGISTRY
|
|
12
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks import AbsTask
|
|
16
|
+
from mteb.models import (
|
|
17
|
+
MTEBModels,
|
|
18
|
+
)
|
|
19
|
+
|
|
13
20
|
logger = logging.getLogger(__name__)
|
|
14
21
|
|
|
15
22
|
|
|
@@ -116,7 +123,10 @@ def get_model(
|
|
|
116
123
|
|
|
117
124
|
|
|
118
125
|
def get_model_meta(
|
|
119
|
-
model_name: str,
|
|
126
|
+
model_name: str,
|
|
127
|
+
revision: str | None = None,
|
|
128
|
+
fetch_from_hf: bool = True,
|
|
129
|
+
fill_missing: bool = False,
|
|
120
130
|
) -> ModelMeta:
|
|
121
131
|
"""A function to fetch a model metadata object by name.
|
|
122
132
|
|
|
@@ -124,6 +134,7 @@ def get_model_meta(
|
|
|
124
134
|
model_name: Name of the model to fetch
|
|
125
135
|
revision: Revision of the model to fetch
|
|
126
136
|
fetch_from_hf: Whether to fetch the model from HuggingFace Hub if not found in the registry
|
|
137
|
+
fill_missing: Computes missing attributes from the metadata including number of parameters and memory usage.
|
|
127
138
|
|
|
128
139
|
Returns:
|
|
129
140
|
A model metadata object
|
|
@@ -135,10 +146,25 @@ def get_model_meta(
|
|
|
135
146
|
raise ValueError(
|
|
136
147
|
f"Model revision {revision} not found for model {model_name}. Expected {model_meta.revision}."
|
|
137
148
|
)
|
|
149
|
+
|
|
150
|
+
if fill_missing and fetch_from_hf:
|
|
151
|
+
original_meta_dict = model_meta.model_dump()
|
|
152
|
+
new_meta = ModelMeta.from_hub(model_name)
|
|
153
|
+
new_meta_dict = new_meta.model_dump(exclude_none=True)
|
|
154
|
+
|
|
155
|
+
updates = {
|
|
156
|
+
k: v
|
|
157
|
+
for k, v in new_meta_dict.items()
|
|
158
|
+
if original_meta_dict.get(k) is None
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if updates:
|
|
162
|
+
return model_meta.model_copy(update=updates)
|
|
138
163
|
return model_meta
|
|
164
|
+
|
|
139
165
|
if fetch_from_hf:
|
|
140
166
|
logger.info(
|
|
141
|
-
"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
|
|
167
|
+
f"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
|
|
142
168
|
)
|
|
143
169
|
meta = ModelMeta.from_hub(model_name, revision)
|
|
144
170
|
return meta
|
mteb/models/instruct_wrapper.py
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import torch
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
7
|
|
|
8
8
|
from mteb._requires_package import requires_package
|
|
9
|
-
from mteb.
|
|
10
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
9
|
+
from mteb.types import PromptType
|
|
11
10
|
|
|
12
11
|
from .abs_encoder import AbsEncoder
|
|
13
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
20
|
+
|
|
21
|
+
|
|
14
22
|
logger = logging.getLogger(__name__)
|
|
15
23
|
|
|
16
24
|
|