mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import atexit
|
|
4
|
+
import gc
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import torch
|
|
11
|
+
|
|
12
|
+
from mteb._requires_package import requires_package
|
|
13
|
+
from mteb.models import ModelMeta
|
|
14
|
+
from mteb.models.abs_encoder import AbsEncoder
|
|
15
|
+
from mteb.types import PromptType
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
|
|
20
|
+
from torch.utils.data import DataLoader
|
|
21
|
+
from vllm.config import PoolerConfig # type: ignore[import-not-found]
|
|
22
|
+
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
24
|
+
from mteb.types import Array, BatchedInput
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
Dtype = Literal["half", "float16", "float", "float32", "bfloat16", "auto"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class VllmWrapperBase:
    """Shared plumbing for wrappers around the vllm serving engine.

    Builds a pooling-runner ``vllm.LLM`` instance, records the matching
    ``ModelMeta`` and registers ``cleanup`` to run at interpreter exit.
    Subclasses override ``convert`` to choose the value forwarded to
    ``EngineArgs(convert=...)``.
    """

    # Forwarded verbatim to ``EngineArgs(convert=...)``; overridden by subclasses.
    convert = "auto"
    mteb_model_meta: ModelMeta | None = None

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        *,
        trust_remote_code: bool = True,
        dtype: Dtype = "auto",
        head_dtype: Literal["model"] | Dtype | None = None,
        max_model_len: int | None = None,
        max_num_batched_tokens: int | None = None,
        max_num_seqs: int = 128,
        tensor_parallel_size: int = 1,
        enable_prefix_caching: bool | None = None,
        gpu_memory_utilization: float = 0.9,
        hf_overrides: dict[str, Any] | None = None,
        pooler_config: PoolerConfig | None = None,
        enforce_eager: bool = False,
        **kwargs: Any,
    ):
        """Wrapper for vllm serving engine.

        Args:
            model: Model name string, or a ModelMeta whose ``name`` and ``revision``
                are used (in which case the ``revision`` argument is ignored).
            revision: The revision of the model to use.
            trust_remote_code: Whether to trust remote code execution when loading the model.
                Should be True for models with custom code.
            dtype: Data type for model weights. "auto" will automatically select appropriate
                dtype based on hardware and model capabilities. vllm uses flash attention by
                default, which does not support fp32. Therefore, it defaults to using fp16 for
                inference on fp32 models. Testing has shown a relatively small drop in accuracy.
                You can manually opt for fp32, but inference speed will be very slow.
            head_dtype: "head" refers to the last Linear layer(s) of an LLM, such as the score
                or classifier in a classification model. Uses fp32 for the head by default to
                gain extra precision. When given, it is passed to vllm through ``hf_overrides``.
            max_model_len: Maximum sequence length (context window) supported by the model.
                If None, uses the model's default maximum length.
            max_num_batched_tokens: Maximum number of tokens to process in a single batch.
                If None, automatically determined.
            max_num_seqs: Maximum number of sequences to process concurrently.
            tensor_parallel_size: Number of GPUs for tensor parallelism.
            enable_prefix_caching: Whether to enable KV cache sharing for common prompt prefixes.
                If None, uses the model's default setting.
            gpu_memory_utilization: Target GPU memory utilization ratio (0.0 to 1.0).
            hf_overrides: Dictionary mapping Hugging Face configuration keys to override values.
                The dictionary passed in is never modified.
            pooler_config: Controls the behavior of output pooling in pooling models.
            enforce_eager: Whether to disable CUDA graph optimization and use eager execution.
            **kwargs: Additional arguments to pass to the vllm serving engine model.
        """
        requires_package(
            self,
            "vllm",
            "Wrapper for vllm serving engine",
            install_instruction="pip install mteb[vllm]",
        )

        # Set before vllm is imported below.
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        from vllm import LLM, EngineArgs

        hf_overrides = self._resolve_hf_overrides(hf_overrides, head_dtype)

        model_name = model if isinstance(model, str) else model.name

        if isinstance(model, ModelMeta):
            logger.info(
                "Using revision from model meta. Passed revision will be ignored"
            )
            revision = model.revision

        args = EngineArgs(
            model=model_name,
            revision=revision,
            runner="pooling",
            convert=self.convert,  # type: ignore[arg-type]
            max_model_len=max_model_len,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
            tensor_parallel_size=tensor_parallel_size,
            enable_prefix_caching=enable_prefix_caching,
            gpu_memory_utilization=gpu_memory_utilization,
            hf_overrides=hf_overrides,
            pooler_config=pooler_config,
            enforce_eager=enforce_eager,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            **kwargs,
        )
        self.llm = LLM(**vars(args))

        if isinstance(model, str):
            self.mteb_model_meta = ModelMeta.from_hub(model=model, revision=revision)
        else:
            self.mteb_model_meta = model

        # Make sure the engine is torn down even if the caller never calls cleanup().
        atexit.register(self.cleanup)

    @staticmethod
    def _resolve_hf_overrides(
        hf_overrides: dict[str, Any] | None,
        head_dtype: str | None,
    ) -> dict[str, Any]:
        """Return the overrides to hand to vllm without mutating the caller's dict.

        Args:
            hf_overrides: Overrides supplied by the caller, or None.
            head_dtype: Optional head dtype to inject under the ``"head_dtype"`` key.

        Returns:
            ``{}`` when nothing was supplied, the caller's object unchanged when no
            ``head_dtype`` is requested, otherwise a new dict with ``"head_dtype"`` set.
        """
        if hf_overrides is None:
            hf_overrides = {}
        if head_dtype is None:
            return hf_overrides
        # Build a fresh mapping so the argument object stays untouched.
        return {**hf_overrides, "head_dtype": head_dtype}

    def cleanup(self):
        """Clean up the VLLM distributed runtime environment and release GPU resources.

        Does nothing when no engine is held, including the case where ``__init__``
        failed before the engine was created.
        """
        # getattr: ``llm`` is only assigned at the end of a successful __init__.
        if getattr(self, "llm", None) is None:
            return

        from vllm.distributed import (  # type: ignore[import-not-found]
            cleanup_dist_env_and_memory,
        )

        # Drop our reference first so the collection pass below can reclaim the engine.
        self.llm = None
        gc.collect()
        cleanup_dist_env_and_memory()

    def __del__(self):
        try:
            self.cleanup()
        except Exception:
            # Best effort only: a finalizer must not raise.
            pass
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class VllmEncoderWrapper(AbsEncoder, VllmWrapperBase):
    """vLLM wrapper for Encoder models.

    Args:
        model: model name string or ModelMeta.
        revision: The revision of the model to use.
        prompt_dict: A dictionary mapping task names to prompt strings.
        use_instructions: Whether to use instructions from the prompt_dict.
            When False, values from prompt_dict are used as static prompts (prefixes).
            When True, values from prompt_dict are used as instructions to be formatted
            using the instruction_template.
        instruction_template: A template or callable to format instructions.
            Can be a string with '{instruction}' placeholder or a callable that takes
            the instruction and prompt type and returns a formatted string.
        apply_instruction_to_documents: Whether to apply instructions to documents prompts.
        **kwargs: Additional arguments to pass to the vllm serving engine model.
    """

    convert = "embed"

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        prompt_dict: dict[str, str] | None = None,
        use_instructions: bool = False,
        instruction_template: (
            str | Callable[[str, PromptType | None], str] | None
        ) = None,
        apply_instruction_to_documents: bool = True,
        **kwargs: Any,
    ):
        """Validate the prompt configuration, store it, then build the vllm engine.

        Raises:
            ValueError: If ``use_instructions`` is set without an
                ``instruction_template``, or if a string template lacks the
                ``{instruction}`` placeholder.
        """
        if use_instructions and instruction_template is None:
            raise ValueError(
                "To use instructions, an instruction_template must be provided. "
                "For example, `Instruction: {instruction}`"
            )

        if (
            isinstance(instruction_template, str)
            and "{instruction}" not in instruction_template
        ):
            raise ValueError(
                "Instruction template must contain the string '{instruction}'."
            )

        self.prompts_dict = prompt_dict
        self.use_instructions = use_instructions
        self.instruction_template = instruction_template
        self.apply_instruction_to_passages = apply_instruction_to_documents
        super().__init__(
            model,
            revision,
            **kwargs,
        )

    def encode(
        self,
        inputs: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        """Encodes the given sentences using the encoder.

        Args:
            inputs: The sentences to encode. Only the ``"text"`` entry of each batch is read.
            task_metadata: The metadata of the task. Used to pick the instruction or
                static prompt from ``prompt_dict``.
            hf_split: Split of current task
            hf_subset: Subset of current task
            prompt_type: The name type of prompt. (query or passage)
            **kwargs: Additional arguments to pass to the encoder.

        Returns:
            The encoded sentences, stacked into a single tensor with one entry per input text.
        """
        prompt = ""
        if self.use_instructions and self.prompts_dict is not None:
            prompt = self.get_task_instruction(task_metadata, prompt_type)
        elif self.prompts_dict is not None:
            prompt_name = self.get_prompt_name(task_metadata, prompt_type)
            if prompt_name is not None:
                prompt = self.prompts_dict.get(prompt_name, "")

        if (
            self.use_instructions
            and self.apply_instruction_to_passages is False
            and prompt_type == PromptType.document
        ):
            # Log the value we were given rather than reaching an enum member
            # through another member; the latter fails for values that compare
            # equal to PromptType.document without being enum members.
            logger.info(f"No instruction used, because prompt type = {prompt_type}")
            prompt = ""
        else:
            logger.info(
                f"Using instruction: '{prompt}' for task: '{task_metadata.name}' prompt type: '{prompt_type}'"
            )

        # Flatten every batch into one list so the engine schedules the whole input itself.
        prompts = [prompt + text for batch in inputs for text in batch["text"]]
        outputs = self.llm.encode(
            prompts, pooling_task="embed", truncate_prompt_tokens=-1
        )
        embeddings = torch.stack([output.outputs.data for output in outputs])
        return embeddings
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class VllmCrossEncoderWrapper(VllmWrapperBase):
    """Pairwise-scoring (cross-encoder) front-end built on the vllm engine."""

    convert = "classify"

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        query_prefix: str = "",
        document_prefix: str = "",
        **kwargs: Any,
    ):
        """Build the engine, then remember the text prefixes.

        Args:
            model: model name string or ModelMeta.
            revision: The revision of the model to use.
            query_prefix: Text placed in front of every query before scoring.
            document_prefix: Text placed in front of every document before scoring.
            **kwargs: Forwarded unchanged to the base wrapper.
        """
        super().__init__(model, revision, **kwargs)
        self.query_prefix = query_prefix
        self.document_prefix = document_prefix

    def predict(
        self,
        inputs1: DataLoader[BatchedInput],
        inputs2: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        """Score input pairs with the cross-encoder.

        Args:
            inputs1: Dataloader for the first side of each pair (queries in reranking).
            inputs2: Dataloader for the second side of each pair (documents in reranking).
            task_metadata: Metadata of the current task. Not used by this implementation.
            hf_split: Split of current task. Not used by this implementation.
            hf_subset: Subset of current task. Not used by this implementation.
            prompt_type: The name type of prompt. Not used by this implementation.
            **kwargs: Additional arguments; not used by this implementation.

        Returns:
            A numpy array with one score per output returned by the engine.
        """
        # Collect the "text" field of every batch, with the configured prefix attached.
        query_texts = []
        for query_batch in inputs1:
            query_texts.extend(
                self.query_prefix + raw_query for raw_query in query_batch["text"]
            )

        document_texts = []
        for document_batch in inputs2:
            document_texts.extend(
                self.document_prefix + raw_document
                for raw_document in document_batch["text"]
            )
        # TODO: support score prompt

        scored_pairs = self.llm.score(
            query_texts,
            document_texts,
            truncate_prompt_tokens=-1,
            use_tqdm=False,
        )
        return np.array([pair.outputs.score for pair in scored_pairs])
|
mteb/py.typed
ADDED
|
File without changes
|
|
@@ -1,43 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import functools
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
4
6
|
import warnings
|
|
5
|
-
from collections.abc import Callable, Iterable, Iterator, Sequence
|
|
6
7
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Literal
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
8
9
|
|
|
9
10
|
import pandas as pd
|
|
10
11
|
from packaging.version import InvalidVersion, Version
|
|
11
12
|
from pydantic import BaseModel, ConfigDict
|
|
12
|
-
from typing_extensions import Self
|
|
13
13
|
|
|
14
|
-
from mteb.abstasks.abstask import AbsTask
|
|
15
|
-
from mteb.abstasks.task_metadata import (
|
|
16
|
-
TaskDomain,
|
|
17
|
-
TaskType,
|
|
18
|
-
)
|
|
19
14
|
from mteb.benchmarks.benchmark import Benchmark
|
|
20
15
|
from mteb.models import ModelMeta
|
|
21
16
|
from mteb.models.get_model_meta import get_model_metas
|
|
22
|
-
from mteb.types import (
|
|
23
|
-
ISOLanguage,
|
|
24
|
-
ISOLanguageScript,
|
|
25
|
-
Modalities,
|
|
26
|
-
Score,
|
|
27
|
-
ScoresDict,
|
|
28
|
-
SplitName,
|
|
29
|
-
)
|
|
30
17
|
|
|
31
18
|
from .model_result import ModelResult, _aggregate_and_pivot
|
|
32
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
22
|
+
|
|
23
|
+
from typing_extensions import Self
|
|
24
|
+
|
|
25
|
+
from mteb.abstasks.abstask import AbsTask
|
|
26
|
+
from mteb.abstasks.task_metadata import (
|
|
27
|
+
TaskDomain,
|
|
28
|
+
TaskType,
|
|
29
|
+
)
|
|
30
|
+
from mteb.types import (
|
|
31
|
+
ISOLanguage,
|
|
32
|
+
ISOLanguageScript,
|
|
33
|
+
Modalities,
|
|
34
|
+
Score,
|
|
35
|
+
ScoresDict,
|
|
36
|
+
SplitName,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
33
40
|
logger = logging.getLogger(__name__)
|
|
34
41
|
|
|
35
42
|
|
|
36
|
-
# Global cache for model metas and version parsing
|
|
37
43
|
@functools.lru_cache
def _get_cached_model_metas() -> dict[str, str | None]:
    """Return a name -> revision mapping built from ``get_model_metas()``.

    Entries whose ``name`` is None are left out. The mapping is memoised by
    ``functools.lru_cache``, so ``get_model_metas()`` is only called the first time.
    """
    revision_by_name = {}
    for model_meta in get_model_metas():
        if model_meta.name is None:
            continue
        revision_by_name[model_meta.name] = model_meta.revision
    return revision_by_name
|
|
41
49
|
|
|
42
50
|
|
|
43
51
|
@functools.lru_cache(maxsize=10000)
|
|
@@ -77,10 +85,10 @@ class BenchmarkResults(BaseModel):
|
|
|
77
85
|
task_names: list[str] | None = None,
|
|
78
86
|
languages: list[str] | None = None,
|
|
79
87
|
domains: list[TaskDomain] | None = None,
|
|
80
|
-
task_types: list[TaskType] | None = None,
|
|
88
|
+
task_types: list[TaskType] | None = None,
|
|
81
89
|
modalities: list[Modalities] | None = None,
|
|
82
90
|
is_public: bool | None = None,
|
|
83
|
-
) ->
|
|
91
|
+
) -> BenchmarkResults:
|
|
84
92
|
# TODO: Same as filter_models
|
|
85
93
|
model_results = [
|
|
86
94
|
res._filter_tasks(
|
|
@@ -97,7 +105,7 @@ class BenchmarkResults(BaseModel):
|
|
|
97
105
|
model_results=[res for res in model_results if res.task_results]
|
|
98
106
|
)
|
|
99
107
|
|
|
100
|
-
def select_tasks(self, tasks:
|
|
108
|
+
def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
|
|
101
109
|
"""Select tasks from the benchmark results.
|
|
102
110
|
|
|
103
111
|
Args:
|
|
@@ -115,7 +123,7 @@ class BenchmarkResults(BaseModel):
|
|
|
115
123
|
self,
|
|
116
124
|
names: list[str] | list[ModelMeta],
|
|
117
125
|
revisions: list[str | None] | None = None,
|
|
118
|
-
) ->
|
|
126
|
+
) -> BenchmarkResults:
|
|
119
127
|
"""Get models by name and revision.
|
|
120
128
|
|
|
121
129
|
Args:
|
|
@@ -128,7 +136,7 @@ class BenchmarkResults(BaseModel):
|
|
|
128
136
|
models_res = []
|
|
129
137
|
_revisions = revisions if revisions is not None else [None] * len(names)
|
|
130
138
|
|
|
131
|
-
name_rev = {}
|
|
139
|
+
name_rev: dict[str, str | None] = {}
|
|
132
140
|
|
|
133
141
|
if len(names) != len(_revisions):
|
|
134
142
|
raise ValueError(
|
|
@@ -137,9 +145,12 @@ class BenchmarkResults(BaseModel):
|
|
|
137
145
|
|
|
138
146
|
for name, revision in zip(names, _revisions):
|
|
139
147
|
if isinstance(name, ModelMeta):
|
|
148
|
+
if name.name is None:
|
|
149
|
+
raise ValueError("name in ModelMeta is None. It must be a string.")
|
|
140
150
|
name_rev[name.name] = name.revision
|
|
141
151
|
else:
|
|
142
|
-
|
|
152
|
+
name_ = cast("str", name)
|
|
153
|
+
name_rev[name_] = revision
|
|
143
154
|
|
|
144
155
|
for model_res in self.model_results:
|
|
145
156
|
model_name = model_res.model_name
|
|
@@ -159,7 +170,7 @@ class BenchmarkResults(BaseModel):
|
|
|
159
170
|
n_parameters_range: tuple[int | None, int | None] = (None, None),
|
|
160
171
|
use_instructions: bool | None = None,
|
|
161
172
|
zero_shot_on: list[AbsTask] | None = None,
|
|
162
|
-
) ->
|
|
173
|
+
) -> BenchmarkResults:
|
|
163
174
|
# mostly a utility function for the leaderboard app.
|
|
164
175
|
# I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
|
|
165
176
|
# interface would then be the same as the get_models function
|
|
@@ -182,7 +193,7 @@ class BenchmarkResults(BaseModel):
|
|
|
182
193
|
|
|
183
194
|
return type(self).model_construct(model_results=new_model_results)
|
|
184
195
|
|
|
185
|
-
def join_revisions(self) ->
|
|
196
|
+
def join_revisions(self) -> BenchmarkResults:
|
|
186
197
|
"""Join revisions of the same model.
|
|
187
198
|
|
|
188
199
|
In case of conflicts, the following rules are applied:
|
|
@@ -212,10 +223,10 @@ class BenchmarkResults(BaseModel):
|
|
|
212
223
|
|
|
213
224
|
# Use cached model metas
|
|
214
225
|
model_to_main_revision = _get_cached_model_metas()
|
|
215
|
-
task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
|
|
226
|
+
task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
|
|
216
227
|
|
|
217
228
|
# Use cached version parsing
|
|
218
|
-
task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
|
|
229
|
+
task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
|
|
219
230
|
|
|
220
231
|
# Filter out rows without scores first
|
|
221
232
|
task_df = task_df[task_df["has_scores"]]
|
|
@@ -259,8 +270,8 @@ class BenchmarkResults(BaseModel):
|
|
|
259
270
|
# so grouping by original revision ensures consistent ModelResult creation
|
|
260
271
|
for (model, model_revision), group in task_df.groupby(["model", "revision"]):
|
|
261
272
|
model_result = ModelResult.model_construct(
|
|
262
|
-
model_name=model,
|
|
263
|
-
model_revision=model_revision,
|
|
273
|
+
model_name=model, # type: ignore[arg-type]
|
|
274
|
+
model_revision=model_revision, # type: ignore[arg-type]
|
|
264
275
|
task_results=list(group["task_result"]),
|
|
265
276
|
)
|
|
266
277
|
model_results.append(model_result)
|
|
@@ -291,7 +302,7 @@ class BenchmarkResults(BaseModel):
|
|
|
291
302
|
{
|
|
292
303
|
"model": model_res.model_name,
|
|
293
304
|
"revision": model_res.model_revision,
|
|
294
|
-
**model_scores,
|
|
305
|
+
**model_scores,
|
|
295
306
|
}
|
|
296
307
|
)
|
|
297
308
|
except Exception as e:
|
|
@@ -364,7 +375,9 @@ class BenchmarkResults(BaseModel):
|
|
|
364
375
|
scores_data.extend(model_result._get_score_for_table())
|
|
365
376
|
|
|
366
377
|
if not scores_data:
|
|
367
|
-
|
|
378
|
+
msg = "No scores data available. Returning empty DataFrame."
|
|
379
|
+
logger.warning(msg)
|
|
380
|
+
warnings.warn(msg)
|
|
368
381
|
return pd.DataFrame()
|
|
369
382
|
|
|
370
383
|
# Create DataFrame
|
|
@@ -402,7 +415,7 @@ class BenchmarkResults(BaseModel):
|
|
|
402
415
|
|
|
403
416
|
return self.benchmark._create_summary_table(self)
|
|
404
417
|
|
|
405
|
-
def __iter__(self) -> Iterator[ModelResult]:
|
|
418
|
+
def __iter__(self) -> Iterator[ModelResult]: # type: ignore[override]
|
|
406
419
|
return iter(self.model_results)
|
|
407
420
|
|
|
408
421
|
def __getitem__(self, index: int) -> ModelResult:
|
|
@@ -424,11 +437,11 @@ class BenchmarkResults(BaseModel):
|
|
|
424
437
|
out_file.write(self.model_dump_json(indent=2))
|
|
425
438
|
|
|
426
439
|
@classmethod
|
|
427
|
-
def from_validated(cls, **data) ->
|
|
440
|
+
def from_validated(cls, **data: Any) -> BenchmarkResults:
|
|
428
441
|
"""Create BenchmarkResults from validated data.
|
|
429
442
|
|
|
430
443
|
Args:
|
|
431
|
-
data:
|
|
444
|
+
**data: Arbitrary keyword arguments containing the data.
|
|
432
445
|
|
|
433
446
|
Returns:
|
|
434
447
|
An instance of BenchmarkResults.
|