mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/leaderboard/table.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
import gradio as gr
|
|
2
6
|
import matplotlib.pyplot as plt
|
|
3
7
|
import numpy as np
|
|
@@ -5,8 +9,9 @@ import pandas as pd
|
|
|
5
9
|
from matplotlib.colors import LinearSegmentedColormap
|
|
6
10
|
from pandas.api.types import is_numeric_dtype
|
|
7
11
|
|
|
8
|
-
|
|
9
|
-
from mteb.
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from mteb.benchmarks.benchmark import Benchmark
|
|
14
|
+
from mteb.results.benchmark_results import BenchmarkResults
|
|
10
15
|
|
|
11
16
|
|
|
12
17
|
def _borda_count(scores: pd.Series) -> pd.Series:
|
mteb/load_results.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
import sys
|
|
4
|
-
from
|
|
5
|
-
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
8
|
from mteb.abstasks.abstask import AbsTask
|
|
8
9
|
from mteb.models.model_meta import ModelMeta
|
|
9
10
|
from mteb.results import BenchmarkResults, ModelResult, TaskResult
|
|
10
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable, Sequence
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from mteb.types import ModelName, Revision
|
|
11
17
|
|
|
12
18
|
if sys.version_info >= (3, 13):
|
|
13
19
|
from warnings import deprecated
|
|
@@ -45,8 +51,8 @@ def _model_name_and_revision(
|
|
|
45
51
|
def load_results(
|
|
46
52
|
results_repo: str = "https://github.com/embeddings-benchmark/results",
|
|
47
53
|
download_latest: bool = True,
|
|
48
|
-
models:
|
|
49
|
-
tasks:
|
|
54
|
+
models: Iterable[ModelMeta] | Sequence[str] | None = None,
|
|
55
|
+
tasks: Iterable[AbsTask] | Sequence[str] | None = None,
|
|
50
56
|
validate_and_filter: bool = True,
|
|
51
57
|
require_model_meta: bool = True,
|
|
52
58
|
only_main_score: bool = False,
|
|
@@ -83,21 +89,21 @@ def load_results(
|
|
|
83
89
|
|
|
84
90
|
if models is not None:
|
|
85
91
|
models_to_keep = {}
|
|
86
|
-
for
|
|
87
|
-
if isinstance(
|
|
88
|
-
models_to_keep[
|
|
92
|
+
for model in models:
|
|
93
|
+
if isinstance(model, ModelMeta):
|
|
94
|
+
models_to_keep[model.name] = model.revision
|
|
89
95
|
else:
|
|
90
|
-
models_to_keep[
|
|
96
|
+
models_to_keep[model] = None
|
|
91
97
|
else:
|
|
92
98
|
models_to_keep = None
|
|
93
99
|
|
|
94
|
-
task_names = {}
|
|
100
|
+
task_names: dict[str, AbsTask | None] = {}
|
|
95
101
|
if tasks is not None:
|
|
96
|
-
for
|
|
97
|
-
if isinstance(
|
|
98
|
-
task_names[
|
|
102
|
+
for task_ in tasks:
|
|
103
|
+
if isinstance(task_, AbsTask):
|
|
104
|
+
task_names[task_.metadata.name] = task_
|
|
99
105
|
else:
|
|
100
|
-
task_names[
|
|
106
|
+
task_names[task_] = None
|
|
101
107
|
|
|
102
108
|
model_results = []
|
|
103
109
|
for model_path in model_paths:
|
mteb/models/abs_encoder.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
4
|
+
import warnings
|
|
2
5
|
from abc import ABC, abstractmethod
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, Literal, cast, get_args, overload
|
|
5
|
-
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, cast, get_args, overload
|
|
7
7
|
|
|
8
8
|
import mteb
|
|
9
|
-
from mteb.abstasks.task_metadata import
|
|
9
|
+
from mteb.abstasks.task_metadata import TaskType
|
|
10
10
|
from mteb.similarity_functions import (
|
|
11
11
|
cos_sim,
|
|
12
12
|
dot_score,
|
|
@@ -16,12 +16,25 @@ from mteb.similarity_functions import (
|
|
|
16
16
|
pairwise_max_sim,
|
|
17
17
|
)
|
|
18
18
|
from mteb.types import (
|
|
19
|
-
Array,
|
|
20
|
-
BatchedInput,
|
|
21
19
|
PromptType,
|
|
22
20
|
)
|
|
23
21
|
|
|
24
|
-
from .model_meta import
|
|
22
|
+
from .model_meta import ScoringFunction
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from collections.abc import Callable, Sequence
|
|
26
|
+
|
|
27
|
+
from torch.utils.data import DataLoader
|
|
28
|
+
from typing_extensions import Unpack
|
|
29
|
+
|
|
30
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
31
|
+
from mteb.types import (
|
|
32
|
+
Array,
|
|
33
|
+
BatchedInput,
|
|
34
|
+
EncodeKwargs,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from .model_meta import ModelMeta
|
|
25
38
|
|
|
26
39
|
logger = logging.getLogger(__name__)
|
|
27
40
|
|
|
@@ -43,7 +56,7 @@ class AbsEncoder(ABC):
|
|
|
43
56
|
model: Any
|
|
44
57
|
mteb_model_meta: ModelMeta | None = None
|
|
45
58
|
model_prompts: dict[str, str] | None = None
|
|
46
|
-
instruction_template: str | Callable[[str, PromptType], str] | None = None
|
|
59
|
+
instruction_template: str | Callable[[str, PromptType | None], str] | None = None
|
|
47
60
|
prompts_dict: dict[str, str] | None = None
|
|
48
61
|
|
|
49
62
|
def get_prompt_name(
|
|
@@ -110,7 +123,7 @@ class AbsEncoder(ABC):
|
|
|
110
123
|
if not self.model_prompts:
|
|
111
124
|
return None
|
|
112
125
|
prompt_name = self.get_prompt_name(task_metadata, prompt_type)
|
|
113
|
-
return self.model_prompts.get(prompt_name)
|
|
126
|
+
return self.model_prompts.get(prompt_name) if prompt_name else None
|
|
114
127
|
|
|
115
128
|
@staticmethod
|
|
116
129
|
@overload
|
|
@@ -187,6 +200,7 @@ class AbsEncoder(ABC):
|
|
|
187
200
|
except KeyError:
|
|
188
201
|
msg = f"Task name {task_name} is not valid. {valid_keys_msg}"
|
|
189
202
|
logger.warning(msg)
|
|
203
|
+
warnings.warn(msg)
|
|
190
204
|
invalid_task_messages.add(msg)
|
|
191
205
|
invalid_keys.add(task_key)
|
|
192
206
|
|
|
@@ -232,9 +246,9 @@ class AbsEncoder(ABC):
|
|
|
232
246
|
if isinstance(prompt, dict) and prompt_type:
|
|
233
247
|
if prompt.get(prompt_type.value):
|
|
234
248
|
return prompt[prompt_type.value]
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
249
|
+
msg = f"Prompt type '{prompt_type}' not found in task metadata for task '{task_metadata.name}'."
|
|
250
|
+
logger.warning(msg)
|
|
251
|
+
warnings.warn(msg)
|
|
238
252
|
return ""
|
|
239
253
|
|
|
240
254
|
if prompt:
|
|
@@ -310,7 +324,7 @@ class AbsEncoder(ABC):
|
|
|
310
324
|
):
|
|
311
325
|
arr = self.model.similarity(embeddings1, embeddings2)
|
|
312
326
|
# We assume that the model returns an Array-like object:
|
|
313
|
-
arr = cast(Array, arr)
|
|
327
|
+
arr = cast("Array", arr)
|
|
314
328
|
return arr
|
|
315
329
|
return cos_sim(embeddings1, embeddings2)
|
|
316
330
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -348,7 +362,7 @@ class AbsEncoder(ABC):
|
|
|
348
362
|
):
|
|
349
363
|
arr = self.model.similarity_pairwise(embeddings1, embeddings2)
|
|
350
364
|
# We assume that the model returns an Array-like object:
|
|
351
|
-
arr = cast(Array, arr)
|
|
365
|
+
arr = cast("Array", arr)
|
|
352
366
|
return arr
|
|
353
367
|
return pairwise_cos_sim(embeddings1, embeddings2)
|
|
354
368
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -368,7 +382,7 @@ class AbsEncoder(ABC):
|
|
|
368
382
|
hf_split: str,
|
|
369
383
|
hf_subset: str,
|
|
370
384
|
prompt_type: PromptType | None = None,
|
|
371
|
-
**kwargs:
|
|
385
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
372
386
|
) -> Array:
|
|
373
387
|
"""Encodes the given sentences using the encoder.
|
|
374
388
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, Protocol, runtime_checkable
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import numpy as np
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
@runtime_checkable
|
|
@@ -26,7 +26,7 @@ class CacheBackendProtocol(Protocol):
|
|
|
26
26
|
**kwargs: Additional backend-specific arguments.
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
|
-
def add(self, item: list[
|
|
29
|
+
def add(self, item: list[dict[str, Any]], vectors: np.ndarray) -> None:
|
|
30
30
|
"""Add a vector to the cache.
|
|
31
31
|
|
|
32
32
|
Args:
|
|
@@ -34,7 +34,7 @@ class CacheBackendProtocol(Protocol):
|
|
|
34
34
|
vectors: Embedding vector of shape (dim,) or (1, dim).
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
-
def get_vector(self, item:
|
|
37
|
+
def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
|
|
38
38
|
"""Retrieve the cached vector for the given item.
|
|
39
39
|
|
|
40
40
|
Args:
|
|
@@ -53,5 +53,5 @@ class CacheBackendProtocol(Protocol):
|
|
|
53
53
|
def close(self) -> None:
|
|
54
54
|
"""Release resources or flush data."""
|
|
55
55
|
|
|
56
|
-
def __contains__(self, item:
|
|
56
|
+
def __contains__(self, item: dict[str, Any]) -> bool:
|
|
57
57
|
"""Check whether the cache contains an item."""
|
|
@@ -1,16 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from collections.abc import Mapping
|
|
2
8
|
|
|
3
|
-
from
|
|
9
|
+
from PIL import Image
|
|
4
10
|
|
|
5
11
|
|
|
6
|
-
def _hash_item(item:
|
|
12
|
+
def _hash_item(item: Mapping[str, Any]) -> str:
|
|
7
13
|
item_hash = ""
|
|
8
14
|
if "text" in item:
|
|
9
|
-
|
|
15
|
+
item_text: str = item["text"]
|
|
16
|
+
item_hash = hashlib.sha256(item_text.encode()).hexdigest()
|
|
10
17
|
|
|
11
18
|
if "image" in item:
|
|
12
|
-
from PIL import Image
|
|
13
|
-
|
|
14
19
|
image: Image.Image = item["image"]
|
|
15
20
|
item_hash += hashlib.sha256(image.tobytes()).hexdigest()
|
|
16
21
|
|
|
@@ -1,14 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
5
|
+
import warnings
|
|
3
6
|
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
4
8
|
|
|
5
9
|
import numpy as np
|
|
6
10
|
|
|
7
11
|
from mteb._requires_package import requires_package
|
|
8
|
-
from mteb.types import BatchedInput
|
|
9
12
|
|
|
10
13
|
from ._hash_utils import _hash_item
|
|
11
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import faiss
|
|
17
|
+
|
|
18
|
+
from mteb.types import BatchedInput
|
|
19
|
+
|
|
12
20
|
logger = logging.getLogger(__name__)
|
|
13
21
|
|
|
14
22
|
|
|
@@ -22,7 +30,6 @@ class FaissCache:
|
|
|
22
30
|
"FAISS-based vector cache",
|
|
23
31
|
install_instruction="pip install mteb[faiss-cpu]",
|
|
24
32
|
)
|
|
25
|
-
import faiss
|
|
26
33
|
|
|
27
34
|
self.directory = Path(directory)
|
|
28
35
|
self.directory.mkdir(parents=True, exist_ok=True)
|
|
@@ -36,7 +43,7 @@ class FaissCache:
|
|
|
36
43
|
logger.info(f"Initialized FAISS VectorCacheMap in {self.directory}")
|
|
37
44
|
self.load()
|
|
38
45
|
|
|
39
|
-
def add(self, items: list[
|
|
46
|
+
def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
|
|
40
47
|
"""Add vector to FAISS index."""
|
|
41
48
|
import faiss
|
|
42
49
|
|
|
@@ -71,7 +78,9 @@ class FaissCache:
|
|
|
71
78
|
try:
|
|
72
79
|
return self.index.reconstruct(idx)
|
|
73
80
|
except Exception:
|
|
74
|
-
|
|
81
|
+
msg = f"Vector id {idx} missing for hash {item_hash}"
|
|
82
|
+
logger.warning(msg)
|
|
83
|
+
warnings.warn(msg)
|
|
75
84
|
return None
|
|
76
85
|
|
|
77
86
|
def save(self) -> None:
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
|
|
7
|
-
from mteb.types import BatchedInput
|
|
8
|
-
|
|
9
9
|
from ._hash_utils import _hash_item
|
|
10
10
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
|
|
|
14
14
|
class NumpyCache:
|
|
15
15
|
"""Generic vector cache for both text and images."""
|
|
16
16
|
|
|
17
|
-
def __init__(self, directory: str | Path, initial_vectors: int =
|
|
17
|
+
def __init__(self, directory: str | Path, initial_vectors: int = 100_000):
|
|
18
18
|
self.directory = Path(directory)
|
|
19
19
|
self.directory.mkdir(parents=True, exist_ok=True)
|
|
20
20
|
self.vectors_file = self.directory / "vectors.npy"
|
|
@@ -27,7 +27,7 @@ class NumpyCache:
|
|
|
27
27
|
logger.info(f"Initialized VectorCacheMap in directory: {self.directory}")
|
|
28
28
|
self._initialize_vectors_file()
|
|
29
29
|
|
|
30
|
-
def add(self,
|
|
30
|
+
def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
|
|
31
31
|
"""Add a vector to the cache."""
|
|
32
32
|
try:
|
|
33
33
|
if self.vector_dim is None:
|
|
@@ -38,12 +38,17 @@ class NumpyCache:
|
|
|
38
38
|
self._save_dimension()
|
|
39
39
|
logger.info(f"Initialized vector dimension to {self.vector_dim}")
|
|
40
40
|
|
|
41
|
-
|
|
41
|
+
if self.vectors is None:
|
|
42
|
+
raise RuntimeError(
|
|
43
|
+
"Vectors file not initialized. Call _initialize_vectors_file() first."
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
for item, vec in zip(items, vectors):
|
|
42
47
|
item_hash = _hash_item(item)
|
|
43
48
|
if item_hash in self.hash_to_index:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
)
|
|
49
|
+
msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector."
|
|
50
|
+
logger.warning(msg)
|
|
51
|
+
warnings.warn(msg)
|
|
47
52
|
index = self.hash_to_index[item_hash]
|
|
48
53
|
else:
|
|
49
54
|
index = len(self.hash_to_index)
|
|
@@ -74,18 +79,26 @@ class NumpyCache:
|
|
|
74
79
|
shape=(self.initial_vectors, self.vector_dim),
|
|
75
80
|
)
|
|
76
81
|
else:
|
|
77
|
-
self.vectors = np.memmap(
|
|
78
|
-
|
|
82
|
+
self.vectors = np.memmap(
|
|
83
|
+
self.vectors_file,
|
|
84
|
+
dtype="float32",
|
|
85
|
+
mode="r+",
|
|
86
|
+
shape=(-1, self.vector_dim),
|
|
87
|
+
)
|
|
79
88
|
logger.info(f"Vectors file initialized with shape: {self.vectors.shape}")
|
|
80
89
|
|
|
81
90
|
def _double_vectors_file(self) -> None:
|
|
91
|
+
if self.vectors is None or self.vector_dim is None:
|
|
92
|
+
raise RuntimeError(
|
|
93
|
+
"Vectors file not initialized. Call _initialize_vectors_file() first."
|
|
94
|
+
)
|
|
82
95
|
current_size = len(self.vectors)
|
|
83
96
|
new_size = current_size * 2
|
|
84
97
|
logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors")
|
|
85
98
|
self.vectors.flush()
|
|
86
99
|
new_vectors = np.memmap(
|
|
87
|
-
self.vectors_file,
|
|
88
|
-
dtype=
|
|
100
|
+
str(self.vectors_file),
|
|
101
|
+
dtype=np.float32,
|
|
89
102
|
mode="r+",
|
|
90
103
|
shape=(new_size, self.vector_dim),
|
|
91
104
|
)
|
|
@@ -107,9 +120,9 @@ class NumpyCache:
|
|
|
107
120
|
f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
|
|
108
121
|
)
|
|
109
122
|
else:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
)
|
|
123
|
+
msg = "Dimension file not found. Vector dimension remains uninitialized."
|
|
124
|
+
logger.warning(msg)
|
|
125
|
+
warnings.warn(msg)
|
|
113
126
|
|
|
114
127
|
def save(self) -> None:
|
|
115
128
|
"""Persist VectorCacheMap to disk."""
|
|
@@ -146,25 +159,30 @@ class NumpyCache:
|
|
|
146
159
|
|
|
147
160
|
if self.vector_dim is not None:
|
|
148
161
|
self.vectors = np.memmap(
|
|
149
|
-
self.vectors_file,
|
|
162
|
+
self.vectors_file,
|
|
163
|
+
dtype="float32",
|
|
164
|
+
mode="r+",
|
|
165
|
+
shape=(-1, self.vector_dim),
|
|
150
166
|
)
|
|
151
|
-
self.vectors = self.vectors.reshape(-1, self.vector_dim)
|
|
152
167
|
logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
|
|
153
168
|
else:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
)
|
|
169
|
+
msg = "Vector dimension not set. Unable to load vectors file."
|
|
170
|
+
logger.warning(msg)
|
|
171
|
+
warnings.warn(msg)
|
|
157
172
|
logger.info(f"Loaded VectorCacheMap from {self.directory}")
|
|
158
173
|
else:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
174
|
+
msg = "No existing files found. Initialized empty VectorCacheMap."
|
|
175
|
+
logger.warning(msg)
|
|
176
|
+
warnings.warn(msg)
|
|
162
177
|
except Exception as e:
|
|
163
178
|
logger.error(f"Error loading VectorCacheMap: {str(e)}")
|
|
164
179
|
raise
|
|
165
180
|
|
|
166
|
-
def get_vector(self, item:
|
|
181
|
+
def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
|
|
167
182
|
"""Retrieve vector from index by hash."""
|
|
183
|
+
if self.vectors is None:
|
|
184
|
+
return None
|
|
185
|
+
|
|
168
186
|
try:
|
|
169
187
|
item_hash = _hash_item(item)
|
|
170
188
|
if item_hash not in self.hash_to_index:
|
|
@@ -176,7 +194,7 @@ class NumpyCache:
|
|
|
176
194
|
logger.error(f"Error retrieving vector for item: {str(e)}")
|
|
177
195
|
raise
|
|
178
196
|
|
|
179
|
-
def __contains__(self, item:
|
|
197
|
+
def __contains__(self, item: dict[str, Any]) -> bool:
|
|
180
198
|
return _hash_item(item) in self.hash_to_index
|
|
181
199
|
|
|
182
200
|
def __del__(self):
|
|
@@ -1,21 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import torch
|
|
7
9
|
from datasets import Dataset
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
|
|
10
11
|
from mteb._create_dataloaders import create_dataloader
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
|
-
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
13
|
-
CacheBackendProtocol,
|
|
14
|
-
)
|
|
15
12
|
from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from torch.utils.data import DataLoader
|
|
16
|
+
|
|
17
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
18
|
+
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
19
|
+
CacheBackendProtocol,
|
|
20
|
+
)
|
|
21
|
+
from mteb.models.model_meta import ModelMeta
|
|
22
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
23
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
19
24
|
|
|
20
25
|
logger = logging.getLogger(__name__)
|
|
21
26
|
|
|
@@ -90,9 +95,9 @@ class CachedEmbeddingWrapper:
|
|
|
90
95
|
try:
|
|
91
96
|
cache = self._get_or_create_cache(task_name)
|
|
92
97
|
|
|
93
|
-
uncached_items: list[
|
|
98
|
+
uncached_items: list[dict[str, Any]] = []
|
|
94
99
|
uncached_indices: list[int] = []
|
|
95
|
-
all_items = inputs.dataset
|
|
100
|
+
all_items: Dataset = inputs.dataset
|
|
96
101
|
cached_vectors: dict[int, np.ndarray] = {}
|
|
97
102
|
|
|
98
103
|
for i, item in enumerate(all_items):
|
mteb/models/get_model_meta.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import difflib
|
|
2
4
|
import logging
|
|
3
|
-
from
|
|
4
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
5
6
|
|
|
6
|
-
from mteb.abstasks import AbsTask
|
|
7
7
|
from mteb.models import (
|
|
8
8
|
ModelMeta,
|
|
9
|
-
MTEBModels,
|
|
10
9
|
)
|
|
11
10
|
from mteb.models.model_implementations import MODEL_REGISTRY
|
|
12
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks import AbsTask
|
|
16
|
+
from mteb.models import (
|
|
17
|
+
MTEBModels,
|
|
18
|
+
)
|
|
19
|
+
|
|
13
20
|
logger = logging.getLogger(__name__)
|
|
14
21
|
|
|
15
22
|
|
|
@@ -21,6 +28,7 @@ def get_model_metas(
|
|
|
21
28
|
n_parameters_range: tuple[int | None, int | None] = (None, None),
|
|
22
29
|
use_instructions: bool | None = None,
|
|
23
30
|
zero_shot_on: list[AbsTask] | None = None,
|
|
31
|
+
model_types: Iterable[str] | None = None,
|
|
24
32
|
) -> list[ModelMeta]:
|
|
25
33
|
"""Load all models' metadata that fit the specified criteria.
|
|
26
34
|
|
|
@@ -33,6 +41,7 @@ def get_model_metas(
|
|
|
33
41
|
If (None, None), this filter is ignored.
|
|
34
42
|
use_instructions: Whether to filter by models that use instructions. If None, all models are included.
|
|
35
43
|
zero_shot_on: A list of tasks on which the model is zero-shot. If None this filter is ignored.
|
|
44
|
+
model_types: A list of model types to filter by. If None, all model types are included.
|
|
36
45
|
|
|
37
46
|
Returns:
|
|
38
47
|
A list of model metadata objects that fit the specified criteria.
|
|
@@ -41,6 +50,7 @@ def get_model_metas(
|
|
|
41
50
|
model_names = set(model_names) if model_names is not None else None
|
|
42
51
|
languages = set(languages) if languages is not None else None
|
|
43
52
|
frameworks = set(frameworks) if frameworks is not None else None
|
|
53
|
+
model_types_set = set(model_types) if model_types is not None else None
|
|
44
54
|
for model_meta in MODEL_REGISTRY.values():
|
|
45
55
|
if (model_names is not None) and (model_meta.name not in model_names):
|
|
46
56
|
continue
|
|
@@ -57,6 +67,10 @@ def get_model_metas(
|
|
|
57
67
|
model_meta.use_instructions != use_instructions
|
|
58
68
|
):
|
|
59
69
|
continue
|
|
70
|
+
if model_types_set is not None and not model_types_set.intersection(
|
|
71
|
+
model_meta.model_type
|
|
72
|
+
):
|
|
73
|
+
continue
|
|
60
74
|
|
|
61
75
|
lower, upper = n_parameters_range
|
|
62
76
|
n_parameters = model_meta.n_parameters
|
|
@@ -75,7 +89,10 @@ def get_model_metas(
|
|
|
75
89
|
|
|
76
90
|
|
|
77
91
|
def get_model(
|
|
78
|
-
model_name: str,
|
|
92
|
+
model_name: str,
|
|
93
|
+
revision: str | None = None,
|
|
94
|
+
device: str | None = None,
|
|
95
|
+
**kwargs: Any,
|
|
79
96
|
) -> MTEBModels:
|
|
80
97
|
"""A function to fetch and load model object by name.
|
|
81
98
|
|
|
@@ -85,20 +102,31 @@ def get_model(
|
|
|
85
102
|
Args:
|
|
86
103
|
model_name: Name of the model to fetch
|
|
87
104
|
revision: Revision of the model to fetch
|
|
105
|
+
device: Device used to load the model
|
|
88
106
|
**kwargs: Additional keyword arguments to pass to the model loader
|
|
89
107
|
|
|
90
108
|
Returns:
|
|
91
109
|
A model object
|
|
92
110
|
"""
|
|
93
111
|
meta = get_model_meta(model_name, revision)
|
|
94
|
-
model = meta.load_model(**kwargs)
|
|
112
|
+
model = meta.load_model(device=device, **kwargs)
|
|
113
|
+
|
|
114
|
+
if kwargs:
|
|
115
|
+
logger.info(
|
|
116
|
+
f"Model '{model_name}' loaded with additional arguments: {list(kwargs.keys())}"
|
|
117
|
+
)
|
|
118
|
+
meta = meta.model_copy(deep=True)
|
|
119
|
+
meta.loader_kwargs |= kwargs
|
|
95
120
|
|
|
96
|
-
model.mteb_model_meta = meta # type: ignore
|
|
121
|
+
model.mteb_model_meta = meta # type: ignore[misc]
|
|
97
122
|
return model
|
|
98
123
|
|
|
99
124
|
|
|
100
125
|
def get_model_meta(
|
|
101
|
-
model_name: str,
|
|
126
|
+
model_name: str,
|
|
127
|
+
revision: str | None = None,
|
|
128
|
+
fetch_from_hf: bool = True,
|
|
129
|
+
fill_missing: bool = False,
|
|
102
130
|
) -> ModelMeta:
|
|
103
131
|
"""A function to fetch a model metadata object by name.
|
|
104
132
|
|
|
@@ -106,6 +134,7 @@ def get_model_meta(
|
|
|
106
134
|
model_name: Name of the model to fetch
|
|
107
135
|
revision: Revision of the model to fetch
|
|
108
136
|
fetch_from_hf: Whether to fetch the model from HuggingFace Hub if not found in the registry
|
|
137
|
+
fill_missing: Computes missing attributes from the metadata including number of parameters and memory usage.
|
|
109
138
|
|
|
110
139
|
Returns:
|
|
111
140
|
A model metadata object
|
|
@@ -117,10 +146,25 @@ def get_model_meta(
|
|
|
117
146
|
raise ValueError(
|
|
118
147
|
f"Model revision {revision} not found for model {model_name}. Expected {model_meta.revision}."
|
|
119
148
|
)
|
|
149
|
+
|
|
150
|
+
if fill_missing and fetch_from_hf:
|
|
151
|
+
original_meta_dict = model_meta.model_dump()
|
|
152
|
+
new_meta = ModelMeta.from_hub(model_name)
|
|
153
|
+
new_meta_dict = new_meta.model_dump(exclude_none=True)
|
|
154
|
+
|
|
155
|
+
updates = {
|
|
156
|
+
k: v
|
|
157
|
+
for k, v in new_meta_dict.items()
|
|
158
|
+
if original_meta_dict.get(k) is None
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if updates:
|
|
162
|
+
return model_meta.model_copy(update=updates)
|
|
120
163
|
return model_meta
|
|
164
|
+
|
|
121
165
|
if fetch_from_hf:
|
|
122
166
|
logger.info(
|
|
123
|
-
"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
|
|
167
|
+
f"Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
|
|
124
168
|
)
|
|
125
169
|
meta = ModelMeta.from_hub(model_name, revision)
|
|
126
170
|
return meta
|