mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/abstasks/text/reranking.py
CHANGED
@@ -34,7 +34,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
     For dataformat and other information, see [AbsTaskRetrieval][mteb.abstasks.retrieval.AbsTaskRetrieval].
     """

-    def load_data(self) -> None:
+    def load_data(self, num_proc: int = 1, **kwargs) -> None:
         """Load the dataset."""
         if self.data_loaded:
             return
@@ -43,7 +43,7 @@ class AbsTaskReranking(AbsTaskRetrieval):
             self.transform_old_dataset_format()
         else:
             # use AbsTaskRetrieval default to load the data
-            return super().load_data()
+            return super().load_data(num_proc=num_proc)

     def _process_example(self, example: dict, split: str, query_idx: int) -> dict:
         """Process a single example from the dataset.
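Both hunks thread a new `num_proc` argument through `load_data`, so dataset loading can use multiple worker processes. A minimal usage sketch, assuming some reranking task (the task name below is only an illustrative example, not taken from this diff):

    import mteb

    # Hypothetical usage: load a reranking task's data with 4 worker processes.
    task = mteb.get_task("AskUbuntuDupQuestions")
    task.load_data(num_proc=4)  # forwarded on to super().load_data(num_proc=...)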
mteb/abstasks/text/summarization.py
CHANGED

@@ -1,24 +1,34 @@
+from __future__ import annotations
+
 import logging
-from
+from typing import TYPE_CHECKING

 import numpy as np
-from datasets import Dataset

 from mteb._evaluators import SummarizationEvaluator
-from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
 from mteb.abstasks._statistics_calculation import (
     calculate_score_statistics,
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
@@ -84,6 +94,7 @@ class AbsTaskSummarization(AbsTask):
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> SummarizationMetrics:
         if not isinstance(model, EncoderProtocol):
@@ -105,7 +116,7 @@ class AbsTaskSummarization(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        scores = evaluator(model, encode_kwargs=encode_kwargs)
+        scores = evaluator(model, encode_kwargs=encode_kwargs, num_proc=num_proc)
         if prediction_folder:
             self._save_task_predictions(
                 scores,
mteb/abstasks/zeroshot_classification.py
CHANGED

@@ -1,19 +1,16 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict

 import torch
 from datasets import Dataset
 from sklearn import metrics

 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )

 from ._statistics_calculation import (
@@ -23,6 +20,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
@@ -119,6 +127,7 @@ class AbsTaskZeroShotClassification(AbsTask):
         hf_subset: str,
         encode_kwargs: EncodeKwargs,
         prediction_folder: Path | None = None,
+        num_proc: int = 1,
         **kwargs,
     ) -> ZeroShotClassificationMetrics:
         if not isinstance(model, EncoderProtocol):
@@ -137,7 +146,11 @@ class AbsTaskZeroShotClassification(AbsTask):
             hf_subset=hf_subset,
             **kwargs,
         )
-        probs = evaluator(
+        probs = evaluator(
+            model,
+            encode_kwargs=encode_kwargs,
+            num_proc=num_proc,
+        )

         if prediction_folder:
             self._save_task_predictions(
@@ -162,13 +175,14 @@ class AbsTaskZeroShotClassification(AbsTask):
             accuracy=metrics.accuracy_score(labels, predictions),
         )

-    def _push_dataset_to_hub(self, repo_name: str) -> None:
+    def _push_dataset_to_hub(self, repo_name: str, num_proc: int = 1) -> None:
         self._upload_dataset_to_hub(
             repo_name,
             [
                 self.input_column_name,
                 self.label_column_name,
             ],
+            num_proc=num_proc,
         )
         labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
         labels_dataset.push_to_hub(repo_name, config_name="labels")
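The import shuffles in these files all apply the same deferred-typing pattern: with `from __future__ import annotations`, annotations are never evaluated at runtime, so type-only imports can move under `typing.TYPE_CHECKING` and drop out of the runtime import graph. A minimal sketch of the pattern (not mteb code):

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Visible to type checkers only; never imported at runtime,
        # which avoids import cycles and reduces startup cost.
        from datasets import Dataset

    def first_row(ds: Dataset) -> dict:
        # Fine at runtime: the annotation stays an unevaluated string.
        return ds[0]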
mteb/benchmarks/_create_table.py
CHANGED
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
 import re
 from collections import defaultdict
-from typing import Literal
+from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pandas as pd

 import mteb
 from mteb.get_tasks import get_task, get_tasks
-
+
+if TYPE_CHECKING:
+    from mteb.results.benchmark_results import BenchmarkResults


 def _borda_count(scores: pd.Series) -> pd.Series:
@@ -115,7 +119,6 @@ def _create_summary_table_from_benchmark_results(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_task)
@@ -303,6 +306,7 @@ def _create_per_language_table_from_benchmark_results(

 def _create_summary_table_mean_public_private(
     benchmark_results: BenchmarkResults,
+    exclude_private_from_borda: bool = False,
 ) -> pd.DataFrame:
     """Create summary table from BenchmarkResults.

@@ -311,6 +315,7 @@ def _create_summary_table_mean_public_private(

     Args:
         benchmark_results: BenchmarkResults object containing model results
+        exclude_private_from_borda: If True, calculate Borda rank using only public tasks

     Returns:
         DataFrame with model summaries, ready for styling in the leaderboard
@@ -353,10 +358,13 @@ def _create_summary_table_mean_public_private(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(public)", public_mean)
     joint_table.insert(1, "mean(private)", private_mean)
-
+    if exclude_private_from_borda:
+        borda_per_task = per_task[public_task_name]
+    else:
+        borda_per_task = per_task
+
     joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
     joint_table = joint_table.reset_index()
@@ -476,7 +484,6 @@ def _create_summary_table_mean_subset(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(subset)", overall_subset_mean)
     joint_table["borda_rank"] = _get_borda_rank(per_subset)
     joint_table = joint_table.sort_values("mean(subset)", ascending=False)
@@ -595,7 +602,6 @@ def _create_summary_table_mean_task_type(

     # Build joint table
     joint_table = mean_per_type.copy()
-    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean_by_task_type", typed_mean)
     joint_table = joint_table.sort_values("mean_by_task_type", ascending=False)
     joint_table["borda_rank"] = _get_borda_rank(per_task)
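The `borda_rank` column produced here is a Borda-count ranking: every task (or subset) ranks the models, ranks become points, and points are summed across tasks. A rough illustration of the idea; mteb's actual `_borda_count`/`_get_borda_rank` may differ in details such as tie handling:

    import pandas as pd

    # Illustrative models x tasks score table (made-up numbers).
    per_task = pd.DataFrame(
        {"taskA": [0.9, 0.8, 0.7], "taskB": [0.6, 0.9, 0.8]},
        index=["model_1", "model_2", "model_3"],
    )
    # Each task awards points by rank (higher score -> more points);
    # summing across tasks and re-ranking gives the Borda rank.
    points = per_task.rank(axis=0, ascending=True).sum(axis=1)
    borda_rank = points.rank(ascending=False).astype(int)
    print(borda_rank.sort_values())  # model_2 comes out first here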
mteb/benchmarks/benchmark.py
CHANGED
@@ -123,9 +123,19 @@ class RtebBenchmark(Benchmark):
             _create_summary_table_mean_public_private,
         )

-        joint_table = _create_summary_table_mean_public_private(
+        joint_table = _create_summary_table_mean_public_private(
+            benchmark_results, exclude_private_from_borda=True
+        )
+        # issue 3902: temporary remove the private column from RTEB summary table
+        if "Mean (Private)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Private)"])
         # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+        # but due to 3902, if Private column existed, Mean (Task) was the mean of Public and Private so instead we drop Mean (Task) and rename Mean (Public) to Mean (Task)
         joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
+        if "Mean (Task)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Task)"])
+            joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
+
         return joint_table
mteb/benchmarks/benchmarks/__init__.py
CHANGED

@@ -3,6 +3,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     BEIR_NL,
     BRIGHT,
     BRIGHT_LONG,
+    BRIGHT_V1_1,
     BUILT_MTEB,
     C_MTEB,
     CHEMTEB,
@@ -69,6 +70,7 @@ __all__ = [
     "BEIR_NL",
     "BRIGHT",
     "BRIGHT_LONG",
+    "BRIGHT_V1_1",
     "BUILT_MTEB",
     "CHEMTEB",
     "CHEMTEB_V1_1",
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED

@@ -1330,6 +1330,46 @@ This is the long version of the benchmark, which only filter longer documents.
 """,
 )

+BRIGHT_V1_1 = Benchmark(
+    name="BRIGHT(v1.1)",
+    display_name="Reasoning Retrieval",
+    tasks=get_tasks(
+        tasks=[
+            "BrightBiologyRetrieval",
+            "BrightEarthScienceRetrieval",
+            "BrightEconomicsRetrieval",
+            "BrightPsychologyRetrieval",
+            "BrightRoboticsRetrieval",
+            "BrightStackoverflowRetrieval",
+            "BrightSustainableLivingRetrieval",
+            "BrightPonyRetrieval",
+            "BrightLeetcodeRetrieval",
+            "BrightAopsRetrieval",
+            "BrightTheoremQATheoremsRetrieval",
+            "BrightTheoremQAQuestionsRetrieval",
+            "BrightBiologyLongRetrieval",
+            "BrightEarthScienceLongRetrieval",
+            "BrightEconomicsLongRetrieval",
+            "BrightPsychologyLongRetrieval",
+            "BrightRoboticsLongRetrieval",
+            "BrightStackoverflowLongRetrieval",
+            "BrightSustainableLivingLongRetrieval",
+            "BrightPonyLongRetrieval",
+        ],
+    ),
+    description="v1.1 refactors the BRIGHT into a different tasks and added prompt to individual tasks.",
+    reference="https://brightbenchmark.github.io/",
+    citation=r"""
+@article{su2024bright,
+  author = {Su, Hongjin and Yen, Howard and Xia, Mengzhou and Shi, Weijia and Muennighoff, Niklas and Wang, Han-yu and Liu, Haisu and Shi, Quan and Siegel, Zachary S and Tang, Michael and others},
+  journal = {arXiv preprint arXiv:2407.12883},
+  title = {Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
+  year = {2024},
+}
+""",
+)
+
+
 CODE_RAG = Benchmark(
     name="CodeRAG",
     tasks=get_tasks(
@@ -1781,8 +1821,7 @@ BEIR_NL = Benchmark(
         "TRECCOVID-NL",
     ],
     ),
-    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
-    "translation.",
+    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated translation.",
     reference="https://arxiv.org/abs/2412.08329",
     contacts=["nikolay-banar"],
     citation=r"""
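Once exported from the benchmarks package, the new benchmark should be resolvable by name like any other; a sketch assuming the public `mteb.get_benchmark` lookup:

    import mteb

    # Look up the newly registered benchmark by the name in its definition.
    benchmark = mteb.get_benchmark("BRIGHT(v1.1)")
    print(len(benchmark.tasks))  # 20 Bright* tasks per the definition above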
@@ -10,6 +10,8 @@ RTEB_CITATION = r"""@article{rteb2025,
|
|
|
10
10
|
year = {2025},
|
|
11
11
|
}"""
|
|
12
12
|
|
|
13
|
+
removal_note = "\n\nNote: We have temporarily removed the 'Private' column to read more about this decision out the [announcement](https://github.com/embeddings-benchmark/mteb/issues/3934)."
|
|
14
|
+
|
|
13
15
|
RTEB_MAIN = RtebBenchmark(
|
|
14
16
|
name="RTEB(beta)",
|
|
15
17
|
display_name="RTEB Multilingual",
|
|
@@ -48,7 +50,8 @@ RTEB_MAIN = RtebBenchmark(
|
|
|
48
50
|
"JapaneseLegal1Retrieval",
|
|
49
51
|
],
|
|
50
52
|
),
|
|
51
|
-
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
53
|
+
description="RTEB (ReTrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages. The dataset includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
54
|
+
+ removal_note,
|
|
52
55
|
citation=RTEB_CITATION,
|
|
53
56
|
contacts=["fzowl"],
|
|
54
57
|
)
|
|
@@ -83,7 +86,8 @@ RTEB_ENGLISH = RtebBenchmark(
|
|
|
83
86
|
],
|
|
84
87
|
languages=["eng"],
|
|
85
88
|
),
|
|
86
|
-
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
89
|
+
description="RTEB English is a subset of RTEB containing retrieval tasks in English across legal, finance, code, and healthcare domains. Includes diverse tasks covering specialized domains such as healthcare and finance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
90
|
+
+ removal_note,
|
|
87
91
|
citation=RTEB_CITATION,
|
|
88
92
|
contacts=["fzowl"],
|
|
89
93
|
)
|
|
@@ -101,7 +105,8 @@ RTEB_FRENCH = RtebBenchmark(
|
|
|
101
105
|
],
|
|
102
106
|
languages=["fra"],
|
|
103
107
|
),
|
|
104
|
-
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
108
|
+
description="RTEB French is a subset of RTEB containing retrieval tasks in French across legal and general knowledge domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
109
|
+
+ removal_note,
|
|
105
110
|
citation=RTEB_CITATION,
|
|
106
111
|
contacts=["fzowl"],
|
|
107
112
|
)
|
|
@@ -119,7 +124,8 @@ RTEB_GERMAN = RtebBenchmark(
|
|
|
119
124
|
"GermanLegal1Retrieval",
|
|
120
125
|
],
|
|
121
126
|
),
|
|
122
|
-
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
127
|
+
description="RTEB German is a subset of RTEB containing retrieval tasks in German across legal, healthcare, and business domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
128
|
+
+ removal_note,
|
|
123
129
|
citation=RTEB_CITATION,
|
|
124
130
|
contacts=["fzowl"],
|
|
125
131
|
)
|
|
@@ -135,7 +141,8 @@ RTEB_JAPANESE = RtebBenchmark(
|
|
|
135
141
|
"JapaneseLegal1Retrieval",
|
|
136
142
|
],
|
|
137
143
|
),
|
|
138
|
-
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
144
|
+
description="RTEB Japanese is a subset of RTEB containing retrieval tasks in Japanese across legal and code domains. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
145
|
+
+ removal_note,
|
|
139
146
|
citation=RTEB_CITATION,
|
|
140
147
|
contacts=["fzowl"],
|
|
141
148
|
)
|
|
@@ -156,7 +163,8 @@ RTEB_FINANCE = RtebBenchmark(
|
|
|
156
163
|
"EnglishFinance4Retrieval",
|
|
157
164
|
],
|
|
158
165
|
),
|
|
159
|
-
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
166
|
+
description="RTEB Finance is a subset of RTEB containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, financial document retrieval, and corporate governance. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
167
|
+
+ removal_note,
|
|
160
168
|
citation=RTEB_CITATION,
|
|
161
169
|
contacts=["fzowl"],
|
|
162
170
|
)
|
|
@@ -177,7 +185,8 @@ RTEB_LEGAL = RtebBenchmark(
|
|
|
177
185
|
"JapaneseLegal1Retrieval",
|
|
178
186
|
],
|
|
179
187
|
),
|
|
180
|
-
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
188
|
+
description="RTEB Legal is a subset of RTEB containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and multilingual legal Q&A. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
189
|
+
+ removal_note,
|
|
181
190
|
citation=RTEB_CITATION,
|
|
182
191
|
contacts=["fzowl"],
|
|
183
192
|
)
|
|
@@ -199,7 +208,8 @@ RTEB_CODE = RtebBenchmark(
|
|
|
199
208
|
"JapaneseCode1Retrieval",
|
|
200
209
|
],
|
|
201
210
|
),
|
|
202
|
-
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
211
|
+
description="RTEB Code is a subset of RTEB containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, SQL retrieval, and multilingual code retrieval. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
212
|
+
+ removal_note,
|
|
203
213
|
citation=RTEB_CITATION,
|
|
204
214
|
contacts=["fzowl"],
|
|
205
215
|
)
|
|
@@ -217,7 +227,8 @@ RTEB_HEALTHCARE = RtebBenchmark(
|
|
|
217
227
|
"GermanHealthcare1Retrieval",
|
|
218
228
|
],
|
|
219
229
|
),
|
|
220
|
-
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
230
|
+
description="RTEB Healthcare is a subset of RTEB containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, cross-lingual medical retrieval, and multilingual medical consultation. The benchmark includes both open and closed datasets, providing a robust evaluation framework for real-world applications. To submit results on private tasks, please create [open an issue](https://github.com/embeddings-benchmark/mteb/issues)."
|
|
231
|
+
+ removal_note,
|
|
221
232
|
citation=RTEB_CITATION,
|
|
222
233
|
contacts=["fzowl"],
|
|
223
234
|
)
|
mteb/cache.py
CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import gzip
 import io
 import json
@@ -7,9 +9,8 @@ import shutil
 import subprocess
 import warnings
 from collections import defaultdict
-from collections.abc import Iterable, Sequence
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast

 import requests
 from pydantic import ValidationError
@@ -19,7 +20,11 @@ from mteb.abstasks import AbsTask
 from mteb.benchmarks.benchmark import Benchmark
 from mteb.models import ModelMeta
 from mteb.results import BenchmarkResults, ModelResult, TaskResult
-
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from mteb.types import ModelName, Revision

 logger = logging.getLogger(__name__)

@@ -584,7 +589,7 @@ class ResultCache:

         first_model = next(iter(models))
         if isinstance(first_model, ModelMeta):
-            models = cast(Iterable[ModelMeta], models)
+            models = cast("Iterable[ModelMeta]", models)
             name_and_revision = {
                 (m.model_name_as_path(), m.revision or "no_revision_available")
                 for m in models
@@ -595,7 +600,7 @@ class ResultCache:
             if (p.parent.parent.name, p.parent.name) in name_and_revision
         ]

-        str_models = cast(Sequence[str], models)
+        str_models = cast("Sequence[str]", models)
         model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
         return [p for p in paths if p.parent.parent.name in model_names]
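The `cast(Iterable[ModelMeta], ...)` to `cast("Iterable[ModelMeta]", ...)` edits follow from the import moves: unlike annotations, the first argument of `typing.cast` is an ordinary runtime expression, but `cast` also accepts the type as a string, which it never evaluates. A small sketch of why this works (not mteb code):

    from __future__ import annotations

    from typing import TYPE_CHECKING, cast

    if TYPE_CHECKING:
        from collections.abc import Sequence

    def first(items: object) -> str:
        # The quoted type is never evaluated, so this is safe even though
        # Sequence is only imported for the type checker.
        seq = cast("Sequence[str]", items)
        return seq[0]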
mteb/cli/_display_tasks.py
CHANGED
@@ -1,9 +1,15 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

-from mteb.abstasks import AbsTask
-from mteb.benchmarks import Benchmark
 from mteb.get_tasks import MTEBTasks

+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from mteb.abstasks import AbsTask
+    from mteb.benchmarks import Benchmark
+

 def _display_benchmarks(benchmarks: Sequence[Benchmark]) -> None:
     """Get all benchmarks available in the MTEB."""
mteb/cli/build_cli.py
CHANGED
@@ -3,17 +3,20 @@ import logging
 import os
 import warnings
 from pathlib import Path
+from typing import TYPE_CHECKING

 import torch
 from rich.logging import RichHandler

 import mteb
-from mteb.abstasks.abstask import AbsTask
 from mteb.cache import ResultCache
 from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy
-
+
+if TYPE_CHECKING:
+    from mteb.abstasks.abstask import AbsTask
+    from mteb.types import EncodeKwargs

 logger = logging.getLogger(__name__)
mteb/cli/generate_model_card.py
CHANGED
@@ -1,14 +1,21 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Sequence
 from pathlib import Path
+from typing import TYPE_CHECKING

 from huggingface_hub import ModelCard, ModelCardData, repo_exists

 from mteb.abstasks.abstask import AbsTask
-from mteb.benchmarks.benchmark import Benchmark
 from mteb.cache import ResultCache

+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from mteb.abstasks.abstask import AbsTask
+    from mteb.benchmarks.benchmark import Benchmark
+
 logger = logging.getLogger(__name__)
mteb/deprecated_evaluator.py
CHANGED
@@ -6,7 +6,6 @@ import os
 import sys
 import traceback
 import warnings
-from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from datetime import datetime
 from itertools import chain
@@ -18,26 +17,31 @@ import datasets

 import mteb
 from mteb.abstasks import AbsTask
-from mteb.abstasks.aggregated_task import AbsTaskAggregate
-from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
     ModelMeta,
-    MTEBModels,
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import TaskResult
-
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Sequence
+
+    from sentence_transformers import CrossEncoder, SentenceTransformer
+
+    from mteb.abstasks.aggregated_task import AbsTaskAggregate
+    from mteb.abstasks.task_metadata import TaskCategory, TaskType
+    from mteb.models import (
+        MTEBModels,
+    )
+    from mteb.types import EncodeKwargs, ScoresDict

 if sys.version_info >= (3, 13):
     from warnings import deprecated
 else:
     from typing_extensions import deprecated

-if TYPE_CHECKING:
-    from sentence_transformers import CrossEncoder, SentenceTransformer
-
 logger = logging.getLogger(__name__)

@@ -66,9 +70,9 @@ class MTEB:
         """
         if isinstance(next(iter(tasks)), Benchmark):
             self.benchmarks = tasks
-            self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
+            self.tasks = list(chain.from_iterable(cast("Iterable[Benchmark]", tasks)))
         elif isinstance(next(iter(tasks)), AbsTask):
-            self.tasks = list(cast(Iterable[AbsTask], tasks))
+            self.tasks = list(cast("Iterable[AbsTask]", tasks))

         self.err_logs_path = Path(err_logs_path)
         self._last_evaluated_splits: dict[str, list[str]] = {}
@@ -313,7 +317,7 @@ class MTEB:
         elif isinstance(model, CrossEncoder):
             mteb_model = CrossEncoderWrapper(model)
         else:
-            mteb_model = cast(MTEBModels, model)
+            mteb_model = cast("MTEBModels", model)

         meta = self.create_model_meta(mteb_model)
         output_path = self._create_output_folder(meta, output_folder)
@@ -346,7 +350,7 @@ class MTEB:
         )

         if task.is_aggregate:
-            aggregated_task = cast(AbsTaskAggregate, task)
+            aggregated_task = cast("AbsTaskAggregate", task)
             self_ = MTEB(tasks=aggregated_task.metadata.tasks)
             aggregated_task_results = self_.run(
                 mteb_model,
mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json
ADDED

@@ -0,0 +1,35 @@
+{
+    "standard": {
+        "num_samples": 188113,
+        "number_of_characters": 141769714,
+        "documents_text_statistics": {
+            "total_text_length": 141734227,
+            "min_text_length": 58,
+            "average_text_length": 753.8974425803981,
+            "max_text_length": 7334,
+            "unique_texts": 176508
+        },
+        "documents_image_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 35487,
+            "min_text_length": 85,
+            "average_text_length": 319.7027027027027,
+            "max_text_length": 1167,
+            "unique_texts": 111
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 524,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 4.7207207207207205,
+            "max_relevant_docs_per_query": 8,
+            "unique_relevant_docs": 111
+        },
+        "top_ranked_statistics": {
+            "num_top_ranked": 20264921,
+            "min_top_ranked_per_query": 176954,
+            "average_top_ranked_per_query": 182566.85585585586,
+            "max_top_ranked_per_query": 186176
+        }
+    }
+}