mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED
|
@@ -3,42 +3,52 @@ from __future__ import annotations
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import warnings
|
|
6
|
-
from collections.abc import Callable
|
|
6
|
+
from collections.abc import Callable
|
|
7
7
|
from dataclasses import field
|
|
8
8
|
from enum import Enum
|
|
9
9
|
from functools import partial
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
12
12
|
|
|
13
|
+
import numpy as np
|
|
13
14
|
from huggingface_hub import (
|
|
14
|
-
GitCommitInfo,
|
|
15
15
|
ModelCard,
|
|
16
|
-
ModelCardData,
|
|
17
16
|
get_safetensors_metadata,
|
|
18
17
|
hf_hub_download,
|
|
19
18
|
list_repo_commits,
|
|
19
|
+
model_info,
|
|
20
20
|
repo_exists,
|
|
21
21
|
)
|
|
22
22
|
from huggingface_hub.errors import (
|
|
23
23
|
EntryNotFoundError,
|
|
24
24
|
GatedRepoError,
|
|
25
|
+
HFValidationError,
|
|
25
26
|
NotASafetensorsRepoError,
|
|
26
27
|
RepositoryNotFoundError,
|
|
27
28
|
SafetensorsParsingError,
|
|
28
29
|
)
|
|
29
30
|
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
|
|
31
|
+
from sentence_transformers.models import Transformer
|
|
32
|
+
from torch import nn
|
|
30
33
|
from transformers import AutoConfig
|
|
31
|
-
from typing_extensions import Self
|
|
32
34
|
|
|
33
35
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
34
36
|
from mteb.languages import check_language_code
|
|
35
|
-
from mteb.models.models_protocols import
|
|
37
|
+
from mteb.models.models_protocols import MTEBModels
|
|
36
38
|
from mteb.types import ISOLanguageScript, Licenses, Modalities, StrDate, StrURL
|
|
37
39
|
|
|
38
40
|
if TYPE_CHECKING:
|
|
41
|
+
from collections.abc import Sequence
|
|
42
|
+
|
|
43
|
+
from huggingface_hub import (
|
|
44
|
+
GitCommitInfo,
|
|
45
|
+
ModelCardData,
|
|
46
|
+
)
|
|
39
47
|
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
48
|
+
from typing_extensions import Self
|
|
40
49
|
|
|
41
50
|
from mteb.abstasks import AbsTask
|
|
51
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
42
52
|
|
|
43
53
|
|
|
44
54
|
logger = logging.getLogger(__name__)
|
|
@@ -55,6 +65,10 @@ FRAMEWORKS = Literal[
|
|
|
55
65
|
"PyLate",
|
|
56
66
|
"ColBERT",
|
|
57
67
|
"ColPali",
|
|
68
|
+
"GGUF",
|
|
69
|
+
"safetensors",
|
|
70
|
+
"ONNX",
|
|
71
|
+
"Transformers",
|
|
58
72
|
]
|
|
59
73
|
|
|
60
74
|
MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
|
|
@@ -81,9 +95,6 @@ def _get_loader_name(
|
|
|
81
95
|
return loader.__name__
|
|
82
96
|
|
|
83
97
|
|
|
84
|
-
_SENTENCE_TRANSFORMER_LIB_NAME = "Sentence Transformers"
|
|
85
|
-
|
|
86
|
-
|
|
87
98
|
class ModelMeta(BaseModel):
|
|
88
99
|
"""The model metadata object.
|
|
89
100
|
|
|
@@ -91,8 +102,9 @@ class ModelMeta(BaseModel):
|
|
|
91
102
|
loader: The function that loads the model. If None it assumes that the model is not implemented.
|
|
92
103
|
loader_kwargs: The keyword arguments to pass to the loader function.
|
|
93
104
|
name: The name of the model, ideally the name on huggingface. It should be in the format "organization/model_name".
|
|
94
|
-
n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be
|
|
95
|
-
|
|
105
|
+
n_parameters: The total number of parameters in the model, e.g. `7_000_000` for a 7M parameter model. Can be None if the number of parameters is unknown.
|
|
106
|
+
n_embedding_parameters: The number of parameters used for the embedding layer. Can be None if the number of embedding parameters is not known (e.g. for proprietary models).
|
|
107
|
+
n_active_parameters_override: The number of active parameters used by the model. Should be used **only** for Mixture of Experts models.
|
|
96
108
|
memory_usage_mb: The memory usage of the model in MB. Can be None if the memory usage is not known (e.g. for proprietary models). To calculate it use the `calculate_memory_usage_mb` method.
|
|
97
109
|
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
|
|
98
110
|
models).
|
|
@@ -131,6 +143,8 @@ class ModelMeta(BaseModel):
|
|
|
131
143
|
release_date: StrDate | None
|
|
132
144
|
languages: list[ISOLanguageScript] | None
|
|
133
145
|
n_parameters: int | None
|
|
146
|
+
n_active_parameters_override: int | None = None
|
|
147
|
+
n_embedding_parameters: int | None = None
|
|
134
148
|
memory_usage_mb: float | None
|
|
135
149
|
max_tokens: float | None
|
|
136
150
|
embed_dim: int | None
|
|
@@ -189,6 +203,16 @@ class ModelMeta(BaseModel):
|
|
|
189
203
|
"""
|
|
190
204
|
return "cross-encoder" in self.model_type
|
|
191
205
|
|
|
206
|
+
@property
|
|
207
|
+
def n_active_parameters(self):
|
|
208
|
+
"""Number of active parameters. Assumed to be `n_parameters - n_embedding_parameters`. Can be overwritten using `n_active_parameters_override` e.g. for MoE models."""
|
|
209
|
+
if self.n_active_parameters_override is not None:
|
|
210
|
+
return self.n_active_parameters_override
|
|
211
|
+
|
|
212
|
+
if self.n_parameters is not None and self.n_embedding_parameters is not None:
|
|
213
|
+
return self.n_parameters - self.n_embedding_parameters
|
|
214
|
+
return None
|
|
215
|
+
|
|
192
216
|
@field_validator("similarity_fn_name", mode="before")
|
|
193
217
|
@classmethod
|
|
194
218
|
def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
|
|
@@ -250,7 +274,7 @@ class ModelMeta(BaseModel):
|
|
|
250
274
|
)
|
|
251
275
|
return v
|
|
252
276
|
|
|
253
|
-
def load_model(self, **kwargs: Any) -> MTEBModels:
|
|
277
|
+
def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
|
|
254
278
|
"""Loads the model using the specified loader function."""
|
|
255
279
|
if self.loader is None:
|
|
256
280
|
raise NotImplementedError(
|
|
@@ -262,11 +286,11 @@ class ModelMeta(BaseModel):
|
|
|
262
286
|
# Allow overwrites
|
|
263
287
|
_kwargs = self.loader_kwargs.copy()
|
|
264
288
|
_kwargs.update(kwargs)
|
|
289
|
+
if device is not None:
|
|
290
|
+
_kwargs["device"] = device
|
|
265
291
|
|
|
266
|
-
model:
|
|
267
|
-
|
|
268
|
-
)
|
|
269
|
-
model.mteb_model_meta = self # type: ignore
|
|
292
|
+
model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
|
|
293
|
+
model.mteb_model_meta = self # type: ignore[misc]
|
|
270
294
|
return model
|
|
271
295
|
|
|
272
296
|
def model_name_as_path(self) -> str:
|
|
@@ -307,7 +331,7 @@ class ModelMeta(BaseModel):
|
|
|
307
331
|
embedding_dim = None
|
|
308
332
|
max_tokens = None
|
|
309
333
|
|
|
310
|
-
if model_name and compute_metadata and
|
|
334
|
+
if model_name and compute_metadata and _repo_exists(model_name):
|
|
311
335
|
reference = "https://huggingface.co/" + model_name
|
|
312
336
|
card = ModelCard.load(model_name)
|
|
313
337
|
card_data: ModelCardData = card.data
|
|
@@ -318,22 +342,17 @@ class ModelMeta(BaseModel):
|
|
|
318
342
|
model_config = None
|
|
319
343
|
logger.warning(f"Can't get configuration for {model_name}. Error: {e}")
|
|
320
344
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
)
|
|
325
|
-
frameworks.append(_SENTENCE_TRANSFORMER_LIB_NAME)
|
|
326
|
-
else:
|
|
327
|
-
msg = "Model library not recognized, defaulting to Sentence Transformers loader."
|
|
328
|
-
logger.warning(msg)
|
|
329
|
-
warnings.warn(msg)
|
|
345
|
+
hf_frameworks = (
|
|
346
|
+
cls._get_frameworks_from_hf_tags(model_name) if model_name else []
|
|
347
|
+
)
|
|
348
|
+
frameworks.extend(hf_frameworks)
|
|
330
349
|
|
|
331
350
|
if revision is None:
|
|
332
351
|
revisions = _get_repo_commits(model_name, "model")
|
|
333
352
|
revision = revisions[0].commit_id if revisions else None
|
|
334
353
|
|
|
335
354
|
release_date = cls.fetch_release_date(model_name)
|
|
336
|
-
model_license = card_data.license
|
|
355
|
+
model_license = card_data.license if card_data.license != "other" else None
|
|
337
356
|
n_parameters = cls._calculate_num_parameters_from_hub(model_name)
|
|
338
357
|
memory_usage_mb = cls._calculate_memory_usage_mb(model_name, n_parameters)
|
|
339
358
|
if model_config and hasattr(model_config, "hidden_size"):
|
|
@@ -386,8 +405,14 @@ class ModelMeta(BaseModel):
|
|
|
386
405
|
else model.model_card_data.base_model
|
|
387
406
|
)
|
|
388
407
|
meta = cls._from_hub(name, revision, compute_metadata)
|
|
389
|
-
|
|
390
|
-
|
|
408
|
+
try:
|
|
409
|
+
first = model[0]
|
|
410
|
+
|
|
411
|
+
if isinstance(first, Transformer):
|
|
412
|
+
emb = first.auto_model.get_input_embeddings()
|
|
413
|
+
meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
|
|
414
|
+
except Exception as e:
|
|
415
|
+
logger.warning(f"Could not calculate embedding parameters for {name}: {e}")
|
|
391
416
|
meta.revision = model.model_card_data.base_model_revision or meta.revision
|
|
392
417
|
meta.max_tokens = model.max_seq_length
|
|
393
418
|
meta.embed_dim = model.get_sentence_embedding_dimension()
|
|
@@ -413,11 +438,9 @@ class ModelMeta(BaseModel):
|
|
|
413
438
|
The generated ModelMeta.
|
|
414
439
|
"""
|
|
415
440
|
meta = cls._from_hub(model, revision, compute_metadata)
|
|
416
|
-
if _SENTENCE_TRANSFORMER_LIB_NAME not in meta.framework:
|
|
417
|
-
meta.framework.append("Sentence Transformers")
|
|
418
441
|
meta.modalities = ["text"]
|
|
419
442
|
|
|
420
|
-
if model and compute_metadata and
|
|
443
|
+
if model and compute_metadata and _repo_exists(model):
|
|
421
444
|
# have max_seq_length field
|
|
422
445
|
sbert_config = _get_json_from_hub(
|
|
423
446
|
model, "sentence_bert_config.json", "model", revision=revision
|
|
@@ -435,7 +458,7 @@ class ModelMeta(BaseModel):
|
|
|
435
458
|
and config_sbert.get("similarity_fn_name") is not None
|
|
436
459
|
):
|
|
437
460
|
meta.similarity_fn_name = ScoringFunction.from_str(
|
|
438
|
-
config_sbert
|
|
461
|
+
config_sbert["similarity_fn_name"]
|
|
439
462
|
)
|
|
440
463
|
else:
|
|
441
464
|
meta.similarity_fn_name = ScoringFunction.COSINE
|
|
@@ -461,8 +484,15 @@ class ModelMeta(BaseModel):
|
|
|
461
484
|
from mteb.models import CrossEncoderWrapper
|
|
462
485
|
|
|
463
486
|
meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
|
|
464
|
-
|
|
465
|
-
|
|
487
|
+
try:
|
|
488
|
+
emb = model.model.get_input_embeddings()
|
|
489
|
+
|
|
490
|
+
if isinstance(emb, nn.Embedding):
|
|
491
|
+
meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
|
|
492
|
+
except Exception as e:
|
|
493
|
+
logger.warning(
|
|
494
|
+
f"Could not calculate embedding parameters for {model.model.name_or_path}: {e}"
|
|
495
|
+
)
|
|
466
496
|
meta.revision = model.config._commit_hash or meta.revision
|
|
467
497
|
meta.loader = CrossEncoderWrapper
|
|
468
498
|
meta.embed_dim = None
|
|
@@ -487,7 +517,7 @@ class ModelMeta(BaseModel):
|
|
|
487
517
|
if isinstance(tasks[0], str):
|
|
488
518
|
benchmark_datasets = set(tasks)
|
|
489
519
|
else:
|
|
490
|
-
tasks = cast(Sequence[
|
|
520
|
+
tasks = cast("Sequence[AbsTask]", tasks)
|
|
491
521
|
benchmark_datasets = set()
|
|
492
522
|
for task in tasks:
|
|
493
523
|
benchmark_datasets.add(task.metadata.name)
|
|
@@ -511,10 +541,12 @@ class ModelMeta(BaseModel):
|
|
|
511
541
|
if adapted_training_datasets is not None:
|
|
512
542
|
training_datasets |= adapted_training_datasets
|
|
513
543
|
except (ValueError, KeyError) as e:
|
|
514
|
-
|
|
544
|
+
msg = f"Could not get source model: {e} in MTEB"
|
|
545
|
+
logger.warning(msg)
|
|
546
|
+
warnings.warn(msg)
|
|
515
547
|
|
|
516
548
|
return_dataset = training_datasets.copy()
|
|
517
|
-
visited = set()
|
|
549
|
+
visited: set[str] = set()
|
|
518
550
|
|
|
519
551
|
for dataset in training_datasets:
|
|
520
552
|
similar_tasks = _collect_similar_tasks(dataset, visited)
|
|
@@ -540,7 +572,7 @@ class ModelMeta(BaseModel):
|
|
|
540
572
|
if isinstance(tasks[0], str):
|
|
541
573
|
benchmark_datasets = set(tasks)
|
|
542
574
|
else:
|
|
543
|
-
tasks = cast(Sequence[
|
|
575
|
+
tasks = cast("Sequence[AbsTask]", tasks)
|
|
544
576
|
benchmark_datasets = {task.metadata.name for task in tasks}
|
|
545
577
|
overlap = training_datasets & benchmark_datasets
|
|
546
578
|
perc_overlap = 100 * (len(overlap) / len(benchmark_datasets))
|
|
@@ -548,6 +580,8 @@ class ModelMeta(BaseModel):
|
|
|
548
580
|
|
|
549
581
|
@staticmethod
|
|
550
582
|
def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
|
|
583
|
+
if not model_name:
|
|
584
|
+
return None
|
|
551
585
|
try:
|
|
552
586
|
safetensors_metadata = get_safetensors_metadata(model_name)
|
|
553
587
|
if len(safetensors_metadata.parameter_count) >= 0:
|
|
@@ -561,7 +595,7 @@ class ModelMeta(BaseModel):
|
|
|
561
595
|
logger.warning(
|
|
562
596
|
f"Can't calculate number of parameters for {model_name}. Got error {e}"
|
|
563
597
|
)
|
|
564
|
-
|
|
598
|
+
return None
|
|
565
599
|
|
|
566
600
|
def calculate_num_parameters_from_hub(self) -> int | None:
|
|
567
601
|
"""Calculates the number of parameters in the model.
|
|
@@ -624,7 +658,7 @@ class ModelMeta(BaseModel):
|
|
|
624
658
|
if "API" in self.framework or self.name is None:
|
|
625
659
|
return None
|
|
626
660
|
|
|
627
|
-
return self._calculate_memory_usage_mb(self.
|
|
661
|
+
return self._calculate_memory_usage_mb(self.name, self.n_parameters)
|
|
628
662
|
|
|
629
663
|
@staticmethod
|
|
630
664
|
def fetch_release_date(model_name: str) -> StrDate | None:
|
|
@@ -640,6 +674,43 @@ class ModelMeta(BaseModel):
|
|
|
640
674
|
return release_date
|
|
641
675
|
return None
|
|
642
676
|
|
|
677
|
+
@staticmethod
|
|
678
|
+
def _get_frameworks_from_hf_tags(model_name: str) -> list[FRAMEWORKS]:
|
|
679
|
+
"""Extract frameworks supported by the model from HuggingFace model tags.
|
|
680
|
+
|
|
681
|
+
Args:
|
|
682
|
+
model_name: HuggingFace model name
|
|
683
|
+
|
|
684
|
+
Returns:
|
|
685
|
+
List of framework names found in tags. Defaults to empty list if no frameworks found.
|
|
686
|
+
"""
|
|
687
|
+
try:
|
|
688
|
+
info = model_info(model_name)
|
|
689
|
+
if not info.tags:
|
|
690
|
+
return []
|
|
691
|
+
except Exception as e:
|
|
692
|
+
logger.warning(
|
|
693
|
+
f"Failed to fetch frameworks from HuggingFace tags for {model_name}: {e}"
|
|
694
|
+
)
|
|
695
|
+
return []
|
|
696
|
+
|
|
697
|
+
# Mapping from HuggingFace tags to MTEB framework names
|
|
698
|
+
tag_to_framework: dict[str, FRAMEWORKS] = {
|
|
699
|
+
"sentence-transformers": "Sentence Transformers",
|
|
700
|
+
"transformers": "Transformers",
|
|
701
|
+
"onnx": "ONNX",
|
|
702
|
+
"safetensors": "safetensors",
|
|
703
|
+
"gguf": "GGUF",
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
frameworks: list[FRAMEWORKS] = []
|
|
707
|
+
|
|
708
|
+
for framework_tag in tag_to_framework.keys():
|
|
709
|
+
if framework_tag in info.tags:
|
|
710
|
+
frameworks.append(tag_to_framework[framework_tag])
|
|
711
|
+
|
|
712
|
+
return frameworks
|
|
713
|
+
|
|
643
714
|
def to_python(self) -> str:
|
|
644
715
|
"""Returns a string representation of the model."""
|
|
645
716
|
return _pydantic_instance_to_code(self)
|
|
@@ -784,3 +855,19 @@ def _get_file_on_hub(
|
|
|
784
855
|
except (GatedRepoError, RepositoryNotFoundError, EntryNotFoundError) as e:
|
|
785
856
|
logger.warning(f"Can't get file {file_name} of {repo_id}: {e}")
|
|
786
857
|
return None
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
def _repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
|
|
861
|
+
"""Checks if a repository exists on HuggingFace Hub.
|
|
862
|
+
|
|
863
|
+
`repo_exists` raises HFValidationError for invalid local paths; in that case a warning is logged and False is returned.
|
|
864
|
+
|
|
865
|
+
Args:
|
|
866
|
+
repo_id: The repository ID.
|
|
867
|
+
repo_type: The type of repository (e.g., "model", "dataset", "space").
|
|
868
|
+
"""
|
|
869
|
+
try:
|
|
870
|
+
return repo_exists(repo_id=repo_id, repo_type=repo_type)
|
|
871
|
+
except HFValidationError as e:
|
|
872
|
+
logger.warning(f"Can't check existence of {repo_id}: {e}")
|
|
873
|
+
return False
|
mteb/models/models_protocols.py
CHANGED
|
@@ -1,20 +1,23 @@
|
|
|
1
|
-
from
|
|
2
|
-
|
|
3
|
-
from torch.utils.data import DataLoader
|
|
1
|
+
from __future__ import annotations
|
|
4
2
|
|
|
5
|
-
from
|
|
6
|
-
from mteb.types import (
|
|
7
|
-
Array,
|
|
8
|
-
BatchedInput,
|
|
9
|
-
CorpusDatasetType,
|
|
10
|
-
PromptType,
|
|
11
|
-
QueryDatasetType,
|
|
12
|
-
RetrievalOutputType,
|
|
13
|
-
TopRankedDocumentsType,
|
|
14
|
-
)
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
15
4
|
|
|
16
5
|
if TYPE_CHECKING:
|
|
6
|
+
from torch.utils.data import DataLoader
|
|
7
|
+
from typing_extensions import Unpack
|
|
8
|
+
|
|
9
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
10
|
from mteb.models.model_meta import ModelMeta
|
|
11
|
+
from mteb.types import (
|
|
12
|
+
Array,
|
|
13
|
+
BatchedInput,
|
|
14
|
+
CorpusDatasetType,
|
|
15
|
+
EncodeKwargs,
|
|
16
|
+
PromptType,
|
|
17
|
+
QueryDatasetType,
|
|
18
|
+
RetrievalOutputType,
|
|
19
|
+
TopRankedDocumentsType,
|
|
20
|
+
)
|
|
18
21
|
|
|
19
22
|
|
|
20
23
|
@runtime_checkable
|
|
@@ -28,7 +31,8 @@ class SearchProtocol(Protocol):
|
|
|
28
31
|
task_metadata: TaskMetadata,
|
|
29
32
|
hf_split: str,
|
|
30
33
|
hf_subset: str,
|
|
31
|
-
encode_kwargs:
|
|
34
|
+
encode_kwargs: EncodeKwargs,
|
|
35
|
+
num_proc: int,
|
|
32
36
|
) -> None:
|
|
33
37
|
"""Index the corpus for retrieval.
|
|
34
38
|
|
|
@@ -38,6 +42,7 @@ class SearchProtocol(Protocol):
|
|
|
38
42
|
hf_split: Split of current task, allows to know some additional information about current split.
|
|
39
43
|
hf_subset: Subset of current task. Similar to `hf_split` to get more information
|
|
40
44
|
encode_kwargs: Additional arguments to pass to the encoder during indexing.
|
|
45
|
+
num_proc: Number of processes to use for dataloading.
|
|
41
46
|
"""
|
|
42
47
|
...
|
|
43
48
|
|
|
@@ -49,8 +54,9 @@ class SearchProtocol(Protocol):
|
|
|
49
54
|
hf_split: str,
|
|
50
55
|
hf_subset: str,
|
|
51
56
|
top_k: int,
|
|
52
|
-
encode_kwargs:
|
|
57
|
+
encode_kwargs: EncodeKwargs,
|
|
53
58
|
top_ranked: TopRankedDocumentsType | None = None,
|
|
59
|
+
num_proc: int,
|
|
54
60
|
) -> RetrievalOutputType:
|
|
55
61
|
"""Search the corpus using the given queries.
|
|
56
62
|
|
|
@@ -63,6 +69,7 @@ class SearchProtocol(Protocol):
|
|
|
63
69
|
Passed only from Reranking tasks.
|
|
64
70
|
top_k: Number of top documents to return for each query.
|
|
65
71
|
encode_kwargs: Additional arguments to pass to the encoder during indexing.
|
|
72
|
+
num_proc: Number of processes to use for dataloading.
|
|
66
73
|
|
|
67
74
|
Returns:
|
|
68
75
|
Dictionary with query IDs as keys with dict as values, where each value is a mapping of document IDs to their relevance scores.
|
|
@@ -70,7 +77,7 @@ class SearchProtocol(Protocol):
|
|
|
70
77
|
...
|
|
71
78
|
|
|
72
79
|
@property
|
|
73
|
-
def mteb_model_meta(self) ->
|
|
80
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
74
81
|
"""Metadata of the model"""
|
|
75
82
|
...
|
|
76
83
|
|
|
@@ -83,12 +90,19 @@ class EncoderProtocol(Protocol):
|
|
|
83
90
|
In general the interface is kept aligned with the sentence-transformers interface. In cases where exceptions occur, these are handled within MTEB.
|
|
84
91
|
"""
|
|
85
92
|
|
|
86
|
-
def __init__(
|
|
93
|
+
def __init__(
|
|
94
|
+
self,
|
|
95
|
+
model_name: str,
|
|
96
|
+
revision: str | None,
|
|
97
|
+
device: str | None = None,
|
|
98
|
+
**kwargs: Any,
|
|
99
|
+
) -> None:
|
|
87
100
|
"""The initialization function for the encoder. Used when calling it from the mteb run CLI.
|
|
88
101
|
|
|
89
102
|
Args:
|
|
90
103
|
model_name: Name of the model
|
|
91
104
|
revision: revision of the model
|
|
105
|
+
device: Device used to load the model
|
|
92
106
|
kwargs: Any additional kwargs
|
|
93
107
|
"""
|
|
94
108
|
...
|
|
@@ -101,7 +115,7 @@ class EncoderProtocol(Protocol):
|
|
|
101
115
|
hf_split: str,
|
|
102
116
|
hf_subset: str,
|
|
103
117
|
prompt_type: PromptType | None = None,
|
|
104
|
-
**kwargs:
|
|
118
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
105
119
|
) -> Array:
|
|
106
120
|
"""Encodes the given sentences using the encoder.
|
|
107
121
|
|
|
@@ -168,7 +182,7 @@ class EncoderProtocol(Protocol):
|
|
|
168
182
|
...
|
|
169
183
|
|
|
170
184
|
@property
|
|
171
|
-
def mteb_model_meta(self) ->
|
|
185
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
172
186
|
"""Metadata of the model"""
|
|
173
187
|
...
|
|
174
188
|
|
|
@@ -181,12 +195,19 @@ class CrossEncoderProtocol(Protocol):
|
|
|
181
195
|
In general the interface is kept aligned with the sentence-transformers interface. In cases where exceptions occur, these are handled within MTEB.
|
|
182
196
|
"""
|
|
183
197
|
|
|
184
|
-
def __init__(
|
|
198
|
+
def __init__(
|
|
199
|
+
self,
|
|
200
|
+
model_name: str,
|
|
201
|
+
revision: str | None,
|
|
202
|
+
device: str | None = None,
|
|
203
|
+
**kwargs: Any,
|
|
204
|
+
) -> None:
|
|
185
205
|
"""The initialization function for the encoder. Used when calling it from the mteb run CLI.
|
|
186
206
|
|
|
187
207
|
Args:
|
|
188
208
|
model_name: Name of the model
|
|
189
209
|
revision: revision of the model
|
|
210
|
+
device: Device used to load the model
|
|
190
211
|
kwargs: Any additional kwargs
|
|
191
212
|
"""
|
|
192
213
|
...
|
|
@@ -200,7 +221,7 @@ class CrossEncoderProtocol(Protocol):
|
|
|
200
221
|
hf_split: str,
|
|
201
222
|
hf_subset: str,
|
|
202
223
|
prompt_type: PromptType | None = None,
|
|
203
|
-
**kwargs:
|
|
224
|
+
**kwargs: Unpack[EncodeKwargs],
|
|
204
225
|
) -> Array:
|
|
205
226
|
"""Predicts relevance scores for pairs of inputs. Note that, unlike the encoder, the cross-encoder can compare across inputs.
|
|
206
227
|
|
|
@@ -220,7 +241,7 @@ class CrossEncoderProtocol(Protocol):
|
|
|
220
241
|
...
|
|
221
242
|
|
|
222
243
|
@property
|
|
223
|
-
def mteb_model_meta(self) ->
|
|
244
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
224
245
|
"""Metadata of the model"""
|
|
225
246
|
...
|
|
226
247
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
-
from
|
|
2
|
-
from typing import Protocol
|
|
1
|
+
from __future__ import annotations
|
|
3
2
|
|
|
4
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING, Protocol
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
|
|
8
|
+
from mteb.types import Array, TopRankedDocumentsType
|
|
5
9
|
|
|
6
10
|
|
|
7
11
|
class IndexEncoderSearchProtocol(Protocol):
|
|
@@ -1,13 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
|
|
4
|
+
import warnings
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
3
6
|
|
|
4
7
|
import numpy as np
|
|
5
8
|
import torch
|
|
6
9
|
|
|
7
10
|
from mteb._requires_package import requires_package
|
|
8
11
|
from mteb.models.model_meta import ScoringFunction
|
|
9
|
-
|
|
10
|
-
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
import faiss
|
|
17
|
+
|
|
18
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
19
|
+
from mteb.types import Array, TopRankedDocumentsType
|
|
20
|
+
|
|
11
21
|
|
|
12
22
|
logger = logging.getLogger(__name__)
|
|
13
23
|
|
|
@@ -32,7 +42,6 @@ class FaissSearchIndex:
|
|
|
32
42
|
install_instruction="pip install mteb[faiss-cpu]",
|
|
33
43
|
)
|
|
34
44
|
|
|
35
|
-
import faiss
|
|
36
45
|
from faiss import IndexFlatIP, IndexFlatL2
|
|
37
46
|
|
|
38
47
|
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
|
@@ -108,7 +117,7 @@ class FaissSearchIndex:
|
|
|
108
117
|
ids = ids.tolist()
|
|
109
118
|
|
|
110
119
|
if issubclass(self.index_type, faiss.IndexFlatL2):
|
|
111
|
-
similarities = -np.sqrt(np.maximum(similarities, 0))
|
|
120
|
+
similarities = (-np.sqrt(np.maximum(similarities, 0))).tolist()
|
|
112
121
|
|
|
113
122
|
return similarities, ids
|
|
114
123
|
|
|
@@ -116,8 +125,8 @@ class FaissSearchIndex:
|
|
|
116
125
|
self,
|
|
117
126
|
embeddings: Array,
|
|
118
127
|
top_k: int,
|
|
119
|
-
top_ranked: TopRankedDocumentsType
|
|
120
|
-
query_idx_to_id: dict[int, str]
|
|
128
|
+
top_ranked: TopRankedDocumentsType,
|
|
129
|
+
query_idx_to_id: dict[int, str],
|
|
121
130
|
) -> tuple[list[list[float]], list[list[int]]]:
|
|
122
131
|
doc_id_to_idx = {doc_id: i for i, doc_id in enumerate(self.idxs)}
|
|
123
132
|
scores_all: list[list[float]] = []
|
|
@@ -127,15 +136,17 @@ class FaissSearchIndex:
|
|
|
127
136
|
query_id = query_idx_to_id[query_idx]
|
|
128
137
|
ranked_ids = top_ranked.get(query_id)
|
|
129
138
|
if not ranked_ids:
|
|
130
|
-
|
|
139
|
+
msg = f"No top-ranked documents for query {query_id}"
|
|
140
|
+
logger.warning(msg)
|
|
141
|
+
warnings.warn(msg)
|
|
131
142
|
scores_all.append([])
|
|
132
143
|
idxs_all.append([])
|
|
133
144
|
continue
|
|
134
145
|
|
|
135
146
|
candidate_indices = [doc_id_to_idx[doc_id] for doc_id in ranked_ids]
|
|
136
|
-
d = self.index.d
|
|
147
|
+
d = self.index.d # type: ignore[union-attr]
|
|
137
148
|
candidate_embs = np.vstack(
|
|
138
|
-
[self.index.reconstruct(idx) for idx in candidate_indices]
|
|
149
|
+
[self.index.reconstruct(idx) for idx in candidate_indices] # type: ignore[union-attr]
|
|
139
150
|
)
|
|
140
151
|
sub_reranking_index = self.index_type(d)
|
|
141
152
|
sub_reranking_index.add(candidate_embs)
|