mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
mteb/results/model_result.py
CHANGED
|
@@ -1,29 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import warnings
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, Literal
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
7
8
|
import pandas as pd
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
-
from typing_extensions import
|
|
10
|
+
from typing_extensions import overload
|
|
10
11
|
|
|
11
|
-
from mteb.abstasks.abstask import AbsTask
|
|
12
|
-
from mteb.abstasks.task_metadata import (
|
|
13
|
-
TaskDomain,
|
|
14
|
-
TaskType,
|
|
15
|
-
)
|
|
16
12
|
from mteb.types import (
|
|
17
|
-
ISOLanguage,
|
|
18
|
-
ISOLanguageScript,
|
|
19
13
|
Modalities,
|
|
20
|
-
Score,
|
|
21
|
-
ScoresDict,
|
|
22
|
-
SplitName,
|
|
23
14
|
)
|
|
24
15
|
|
|
25
16
|
from .task_result import TaskError, TaskResult
|
|
26
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Callable, Iterable
|
|
20
|
+
|
|
21
|
+
from mteb.abstasks.abstask import AbsTask
|
|
22
|
+
from mteb.abstasks.task_metadata import (
|
|
23
|
+
TaskDomain,
|
|
24
|
+
TaskType,
|
|
25
|
+
)
|
|
26
|
+
from mteb.types import (
|
|
27
|
+
ISOLanguage,
|
|
28
|
+
ISOLanguageScript,
|
|
29
|
+
Score,
|
|
30
|
+
ScoresDict,
|
|
31
|
+
SplitName,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
27
35
|
logger = logging.getLogger(__name__)
|
|
28
36
|
|
|
29
37
|
|
|
@@ -58,7 +66,7 @@ def _aggregate_and_pivot(
|
|
|
58
66
|
index=index_columns,
|
|
59
67
|
columns=columns,
|
|
60
68
|
values="score",
|
|
61
|
-
aggfunc=aggregation_fn,
|
|
69
|
+
aggfunc=aggregation_fn, # type: ignore[arg-type]
|
|
62
70
|
).reset_index()
|
|
63
71
|
elif format == "long":
|
|
64
72
|
return (
|
|
@@ -81,7 +89,7 @@ class ModelResult(BaseModel):
|
|
|
81
89
|
model_revision: str | None
|
|
82
90
|
task_results: list[TaskResult]
|
|
83
91
|
default_modalities: list[Modalities] = Field(
|
|
84
|
-
default_factory=lambda: ["text"], alias="modalities"
|
|
92
|
+
default_factory=lambda: [cast("Modalities", "text")], alias="modalities"
|
|
85
93
|
)
|
|
86
94
|
model_config = (
|
|
87
95
|
ConfigDict( # to free up the name model_* which is otherwise protected
|
|
@@ -95,16 +103,17 @@ class ModelResult(BaseModel):
|
|
|
95
103
|
return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"
|
|
96
104
|
|
|
97
105
|
@classmethod
|
|
98
|
-
def from_validated(cls, **data: dict[str, Any]) ->
|
|
106
|
+
def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
|
|
99
107
|
"""Create a ModelResult from validated data.
|
|
100
108
|
|
|
101
109
|
Args:
|
|
102
110
|
data: The validated data.
|
|
103
111
|
"""
|
|
104
|
-
data["task_results"] = [
|
|
105
|
-
TaskResult.from_validated(**res)
|
|
112
|
+
data["task_results"] = [ # type: ignore[assignment]
|
|
113
|
+
TaskResult.from_validated(**res) # type: ignore[arg-type]
|
|
114
|
+
for res in data["task_results"]
|
|
106
115
|
]
|
|
107
|
-
return cls.model_construct(**data)
|
|
116
|
+
return cls.model_construct(**data) # type: ignore[arg-type]
|
|
108
117
|
|
|
109
118
|
def _filter_tasks(
|
|
110
119
|
self,
|
|
@@ -114,7 +123,7 @@ class ModelResult(BaseModel):
|
|
|
114
123
|
task_types: list[TaskType] | None = None,
|
|
115
124
|
modalities: list[Modalities] | None = None,
|
|
116
125
|
is_public: bool | None = None,
|
|
117
|
-
) ->
|
|
126
|
+
) -> ModelResult:
|
|
118
127
|
new_task_results = []
|
|
119
128
|
for task_result in self.task_results:
|
|
120
129
|
if (task_names is not None) and (task_result.task_name not in task_names):
|
|
@@ -142,7 +151,7 @@ class ModelResult(BaseModel):
|
|
|
142
151
|
task_results=new_task_results,
|
|
143
152
|
)
|
|
144
153
|
|
|
145
|
-
def select_tasks(self, tasks:
|
|
154
|
+
def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
|
|
146
155
|
"""Select tasks from the ModelResult based on a list of AbsTask objects.
|
|
147
156
|
|
|
148
157
|
Args:
|
|
@@ -160,6 +169,28 @@ class ModelResult(BaseModel):
|
|
|
160
169
|
task_results=new_task_results,
|
|
161
170
|
)
|
|
162
171
|
|
|
172
|
+
@overload
|
|
173
|
+
def _get_scores(
|
|
174
|
+
self,
|
|
175
|
+
splits: list[SplitName] | None = None,
|
|
176
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
177
|
+
scripts: list[ISOLanguageScript] | None = None,
|
|
178
|
+
getter: Callable[[ScoresDict], Score] | None = None,
|
|
179
|
+
aggregation: Callable[[list[Score]], Any] | None = None,
|
|
180
|
+
format: Literal["wide"] = "wide",
|
|
181
|
+
) -> dict: ...
|
|
182
|
+
|
|
183
|
+
@overload
|
|
184
|
+
def _get_scores(
|
|
185
|
+
self,
|
|
186
|
+
splits: list[SplitName] | None = None,
|
|
187
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
188
|
+
scripts: list[ISOLanguageScript] | None = None,
|
|
189
|
+
getter: Callable[[ScoresDict], Score] | None = None,
|
|
190
|
+
aggregation: Callable[[list[Score]], Any] | None = None,
|
|
191
|
+
format: Literal["long"] = "long",
|
|
192
|
+
) -> list: ...
|
|
193
|
+
|
|
163
194
|
def _get_scores(
|
|
164
195
|
self,
|
|
165
196
|
splits: list[SplitName] | None = None,
|
|
@@ -177,21 +208,24 @@ class ModelResult(BaseModel):
|
|
|
177
208
|
aggregation = aggregation if aggregation is not None else np.mean
|
|
178
209
|
else:
|
|
179
210
|
use_fast = True
|
|
211
|
+
aggregation = cast("Callable[[list[Score]], Any]", aggregation)
|
|
212
|
+
getter = cast("Callable[[ScoresDict], Score]", getter)
|
|
213
|
+
|
|
180
214
|
if format == "wide":
|
|
181
215
|
scores = {}
|
|
182
216
|
for res in self.task_results:
|
|
183
217
|
try:
|
|
184
218
|
if use_fast:
|
|
185
219
|
scores[res.task_name] = res._get_score_fast(
|
|
186
|
-
splits=splits,
|
|
187
|
-
languages=languages,
|
|
220
|
+
splits=splits,
|
|
221
|
+
languages=languages,
|
|
188
222
|
)
|
|
189
223
|
else:
|
|
190
224
|
scores[res.task_name] = res.get_score(
|
|
191
225
|
splits=splits,
|
|
192
226
|
languages=languages,
|
|
193
|
-
aggregation=aggregation,
|
|
194
|
-
getter=getter,
|
|
227
|
+
aggregation=aggregation,
|
|
228
|
+
getter=getter,
|
|
195
229
|
scripts=scripts,
|
|
196
230
|
)
|
|
197
231
|
except Exception as e:
|
|
@@ -206,14 +240,14 @@ class ModelResult(BaseModel):
|
|
|
206
240
|
if use_fast:
|
|
207
241
|
score = task_res._get_score_fast(
|
|
208
242
|
splits=splits,
|
|
209
|
-
languages=languages,
|
|
243
|
+
languages=languages,
|
|
210
244
|
)
|
|
211
245
|
else:
|
|
212
246
|
score = task_res.get_score(
|
|
213
247
|
splits=splits,
|
|
214
248
|
languages=languages,
|
|
215
|
-
aggregation=aggregation,
|
|
216
|
-
getter=getter,
|
|
249
|
+
aggregation=aggregation,
|
|
250
|
+
getter=getter,
|
|
217
251
|
scripts=scripts,
|
|
218
252
|
)
|
|
219
253
|
entry = dict(
|
|
@@ -292,7 +326,9 @@ class ModelResult(BaseModel):
|
|
|
292
326
|
scores_data = self._get_score_for_table()
|
|
293
327
|
|
|
294
328
|
if not scores_data:
|
|
295
|
-
|
|
329
|
+
msg = "No scores data available. Returning empty DataFrame."
|
|
330
|
+
logger.warning(msg)
|
|
331
|
+
warnings.warn(msg)
|
|
296
332
|
return pd.DataFrame()
|
|
297
333
|
|
|
298
334
|
# Create DataFrame
|
|
@@ -315,7 +351,7 @@ class ModelResult(BaseModel):
|
|
|
315
351
|
def __hash__(self) -> int:
|
|
316
352
|
return id(self)
|
|
317
353
|
|
|
318
|
-
def __iter__(self) -> Iterable[TaskResult]:
|
|
354
|
+
def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override]
|
|
319
355
|
return iter(self.task_results)
|
|
320
356
|
|
|
321
357
|
def __getitem__(self, index) -> TaskResult:
|
|
@@ -368,13 +404,13 @@ class ModelResult(BaseModel):
|
|
|
368
404
|
return [task_res.task_name for task_res in self.task_results]
|
|
369
405
|
|
|
370
406
|
@property
|
|
371
|
-
def modalities(self) -> list[
|
|
407
|
+
def modalities(self) -> list[Modalities]:
|
|
372
408
|
"""Get all modalities in the task results.
|
|
373
409
|
|
|
374
410
|
Returns:
|
|
375
411
|
A list of modalities in the task results.
|
|
376
412
|
"""
|
|
377
|
-
mods = []
|
|
413
|
+
mods: list[Modalities] = []
|
|
378
414
|
for task_res in self.task_results:
|
|
379
415
|
task_modalities = getattr(task_res, "modalities", [])
|
|
380
416
|
mods.extend(task_modalities)
|
mteb/results/task_result.py
CHANGED
|
@@ -2,33 +2,42 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
-
|
|
5
|
+
import warnings
|
|
6
6
|
from collections import defaultdict
|
|
7
|
-
from collections.abc import Callable, Iterable
|
|
8
7
|
from functools import cached_property
|
|
9
8
|
from importlib.metadata import version
|
|
10
|
-
from
|
|
11
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
12
10
|
|
|
13
11
|
import numpy as np
|
|
14
12
|
from huggingface_hub import EvalResult
|
|
15
13
|
from packaging.version import Version
|
|
16
14
|
from pydantic import BaseModel, field_validator
|
|
17
|
-
from typing_extensions import Self
|
|
18
15
|
|
|
16
|
+
from mteb import TaskMetadata
|
|
19
17
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
18
|
+
from mteb.abstasks import AbsTaskClassification
|
|
20
19
|
from mteb.abstasks.abstask import AbsTask
|
|
21
20
|
from mteb.languages import LanguageScripts
|
|
22
21
|
from mteb.models.model_meta import ScoringFunction
|
|
23
22
|
from mteb.types import (
|
|
24
|
-
HFSubset,
|
|
25
|
-
ISOLanguage,
|
|
26
|
-
ISOLanguageScript,
|
|
27
|
-
Score,
|
|
28
23
|
ScoresDict,
|
|
29
24
|
SplitName,
|
|
30
25
|
)
|
|
31
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from collections.abc import Callable, Iterable, Mapping
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
from typing_extensions import Self
|
|
32
|
+
|
|
33
|
+
from mteb.abstasks.task_metadata import TaskDomain
|
|
34
|
+
from mteb.types import (
|
|
35
|
+
HFSubset,
|
|
36
|
+
ISOLanguage,
|
|
37
|
+
ISOLanguageScript,
|
|
38
|
+
Score,
|
|
39
|
+
)
|
|
40
|
+
|
|
32
41
|
logger = logging.getLogger(__name__)
|
|
33
42
|
|
|
34
43
|
|
|
@@ -39,67 +48,59 @@ class Criteria(HelpfulStrEnum):
|
|
|
39
48
|
DATASET_REVISION = "dataset_revision"
|
|
40
49
|
|
|
41
50
|
|
|
42
|
-
class ScalaNbClassificationDummy:
|
|
51
|
+
class ScalaNbClassificationDummy(AbsTaskClassification):
|
|
43
52
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
44
53
|
|
|
45
|
-
metadata =
|
|
54
|
+
metadata = TaskMetadata(
|
|
46
55
|
name="ScalaNbClassification",
|
|
56
|
+
description="A dummy",
|
|
47
57
|
main_score="accuracy",
|
|
48
58
|
type="Classification",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
},
|
|
52
|
-
dataset={"revision": "revision_not_applicable"},
|
|
53
|
-
revision="revision_not_applicable",
|
|
59
|
+
eval_langs=["nob-Latn"],
|
|
60
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
54
61
|
)
|
|
55
62
|
|
|
56
63
|
|
|
57
|
-
class ScalaNnClassificationDummy:
|
|
64
|
+
class ScalaNnClassificationDummy(AbsTaskClassification):
|
|
58
65
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
59
66
|
|
|
60
|
-
metadata =
|
|
67
|
+
metadata = TaskMetadata(
|
|
61
68
|
name="ScalaNnClassification",
|
|
69
|
+
description="A dummy",
|
|
62
70
|
main_score="accuracy",
|
|
63
71
|
type="Classification",
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
},
|
|
67
|
-
dataset={"revision": "revision_not_applicable"},
|
|
68
|
-
revision="revision_not_applicable",
|
|
72
|
+
eval_langs=["nob-Latn"],
|
|
73
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
69
74
|
)
|
|
70
75
|
|
|
71
76
|
|
|
72
|
-
class ScalaDaClassificationDummy:
|
|
77
|
+
class ScalaDaClassificationDummy(AbsTaskClassification):
|
|
73
78
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
74
79
|
|
|
75
|
-
metadata =
|
|
80
|
+
metadata = TaskMetadata(
|
|
76
81
|
name="ScalaDaClassification",
|
|
82
|
+
description="A dummy",
|
|
77
83
|
main_score="accuracy",
|
|
78
84
|
type="Classification",
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
},
|
|
82
|
-
dataset={"revision": "revision_not_applicable"},
|
|
83
|
-
revision="revision_not_applicable",
|
|
85
|
+
eval_langs=["dan-Latn"],
|
|
86
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
84
87
|
)
|
|
85
88
|
|
|
86
89
|
|
|
87
|
-
class ScalaSvClassificationDummy:
|
|
90
|
+
class ScalaSvClassificationDummy(AbsTaskClassification):
|
|
88
91
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
89
92
|
|
|
90
|
-
metadata =
|
|
93
|
+
metadata = TaskMetadata(
|
|
91
94
|
name="ScalaSvClassification",
|
|
95
|
+
description="A dummy",
|
|
92
96
|
main_score="accuracy",
|
|
93
97
|
type="Classification",
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
},
|
|
97
|
-
dataset={"revision": "revision_not_applicable"},
|
|
98
|
-
revision="revision_not_applicable",
|
|
98
|
+
eval_langs=["swe-Latn"],
|
|
99
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
99
100
|
)
|
|
100
101
|
|
|
101
102
|
|
|
102
|
-
outdated_tasks = {
|
|
103
|
+
outdated_tasks: dict[str, type[AbsTask]] = {
|
|
103
104
|
"ScalaNbClassification": ScalaNbClassificationDummy,
|
|
104
105
|
"ScalaNnClassification": ScalaNnClassificationDummy,
|
|
105
106
|
"ScalaDaClassification": ScalaDaClassificationDummy,
|
|
@@ -166,10 +167,10 @@ class TaskResult(BaseModel):
|
|
|
166
167
|
def from_task_results(
|
|
167
168
|
cls,
|
|
168
169
|
task: AbsTask | type[AbsTask],
|
|
169
|
-
scores: dict[SplitName,
|
|
170
|
+
scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
|
|
170
171
|
evaluation_time: float,
|
|
171
172
|
kg_co2_emissions: float | None = None,
|
|
172
|
-
) ->
|
|
173
|
+
) -> TaskResult:
|
|
173
174
|
"""Create a TaskResult from the task and scores.
|
|
174
175
|
|
|
175
176
|
Args:
|
|
@@ -246,12 +247,12 @@ class TaskResult(BaseModel):
|
|
|
246
247
|
return get_task(self.task_name)
|
|
247
248
|
|
|
248
249
|
@property
|
|
249
|
-
def domains(self) -> list[
|
|
250
|
+
def domains(self) -> list[TaskDomain]:
|
|
250
251
|
"""Get the domains of the task."""
|
|
251
252
|
doms = self.task.metadata.domains
|
|
252
253
|
if doms is None:
|
|
253
254
|
doms = []
|
|
254
|
-
return doms
|
|
255
|
+
return doms
|
|
255
256
|
|
|
256
257
|
@property
|
|
257
258
|
def task_type(self) -> str:
|
|
@@ -307,7 +308,7 @@ class TaskResult(BaseModel):
|
|
|
307
308
|
if isinstance(v, dict):
|
|
308
309
|
self._round_scores(v, n)
|
|
309
310
|
elif isinstance(v, float):
|
|
310
|
-
value[i] = round(v, n)
|
|
311
|
+
value[i] = round(v, n) # type: ignore[call-overload]
|
|
311
312
|
|
|
312
313
|
elif isinstance(value, float):
|
|
313
314
|
scores[key] = round(value, n)
|
|
@@ -325,7 +326,7 @@ class TaskResult(BaseModel):
|
|
|
325
326
|
json.dump(json_obj, f, indent=2)
|
|
326
327
|
|
|
327
328
|
@classmethod
|
|
328
|
-
def from_disk(cls, path: Path, load_historic_data: bool = True) ->
|
|
329
|
+
def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
|
|
329
330
|
"""Load TaskResult from disk.
|
|
330
331
|
|
|
331
332
|
Args:
|
|
@@ -356,7 +357,7 @@ class TaskResult(BaseModel):
|
|
|
356
357
|
) # assume it is before 1.11.0 if the version is not present
|
|
357
358
|
|
|
358
359
|
try:
|
|
359
|
-
obj = cls.model_validate(data)
|
|
360
|
+
obj: TaskResult = cls.model_validate(data)
|
|
360
361
|
except Exception as e:
|
|
361
362
|
if not pre_1_11_load:
|
|
362
363
|
raise e
|
|
@@ -381,6 +382,7 @@ class TaskResult(BaseModel):
|
|
|
381
382
|
from mteb import get_task
|
|
382
383
|
|
|
383
384
|
task_name = obj.task_name
|
|
385
|
+
task: AbsTask | type[AbsTask]
|
|
384
386
|
if task_name in outdated_tasks:
|
|
385
387
|
task = outdated_tasks[task_name]
|
|
386
388
|
else:
|
|
@@ -393,11 +395,11 @@ class TaskResult(BaseModel):
|
|
|
393
395
|
for key in list(hf_subset_scores.keys()):
|
|
394
396
|
if isinstance(hf_subset_scores[key], dict):
|
|
395
397
|
for k, v in hf_subset_scores[key].items():
|
|
396
|
-
hf_subset_scores[f"{key}_{k}"] = v
|
|
397
|
-
hf_subset_scores.pop(key)
|
|
398
|
+
hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index]
|
|
399
|
+
hf_subset_scores.pop(key) # type: ignore[attr-defined]
|
|
398
400
|
|
|
399
401
|
@classmethod
|
|
400
|
-
def _convert_from_before_v1_11_0(cls, data: dict) ->
|
|
402
|
+
def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
|
|
401
403
|
from mteb.get_tasks import _TASKS_REGISTRY
|
|
402
404
|
|
|
403
405
|
# in case the task name is not found in the registry, try to find a lower case version
|
|
@@ -462,7 +464,9 @@ class TaskResult(BaseModel):
|
|
|
462
464
|
if main_score in hf_subset_scores:
|
|
463
465
|
hf_subset_scores["main_score"] = hf_subset_scores[main_score]
|
|
464
466
|
else:
|
|
465
|
-
|
|
467
|
+
msg = f"Main score {main_score} not found in scores"
|
|
468
|
+
logger.warning(msg)
|
|
469
|
+
warnings.warn(msg)
|
|
466
470
|
hf_subset_scores["main_score"] = None
|
|
467
471
|
|
|
468
472
|
# specific fixes:
|
|
@@ -481,7 +485,7 @@ class TaskResult(BaseModel):
|
|
|
481
485
|
scores["test"]["fra-fra"] = scores["test"].pop("fr")
|
|
482
486
|
|
|
483
487
|
result: TaskResult = TaskResult.from_task_results(
|
|
484
|
-
task,
|
|
488
|
+
task,
|
|
485
489
|
scores,
|
|
486
490
|
evaluation_time,
|
|
487
491
|
kg_co2_emissions=None,
|
|
@@ -532,7 +536,7 @@ class TaskResult(BaseModel):
|
|
|
532
536
|
def _get_score_fast(
|
|
533
537
|
self,
|
|
534
538
|
splits: Iterable[str] | None = None,
|
|
535
|
-
languages:
|
|
539
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
536
540
|
subsets: Iterable[str] | None = None,
|
|
537
541
|
) -> float:
|
|
538
542
|
"""Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
|
|
@@ -581,7 +585,7 @@ class TaskResult(BaseModel):
|
|
|
581
585
|
return val_sum / n_val
|
|
582
586
|
|
|
583
587
|
@classmethod
|
|
584
|
-
def from_validated(cls, **data) ->
|
|
588
|
+
def from_validated(cls, **data) -> TaskResult:
|
|
585
589
|
"""Create a TaskResult from validated data.
|
|
586
590
|
|
|
587
591
|
Returns:
|
|
@@ -592,13 +596,13 @@ class TaskResult(BaseModel):
|
|
|
592
596
|
def __repr__(self) -> str:
|
|
593
597
|
return f"TaskResult(task_name={self.task_name}, scores=...)"
|
|
594
598
|
|
|
595
|
-
def only_main_score(self) ->
|
|
599
|
+
def only_main_score(self) -> TaskResult:
|
|
596
600
|
"""Return a new TaskResult object with only the main score.
|
|
597
601
|
|
|
598
602
|
Returns:
|
|
599
603
|
A new TaskResult object with only the main score.
|
|
600
604
|
"""
|
|
601
|
-
new_scores = {}
|
|
605
|
+
new_scores: dict[str, list[Score]] = {}
|
|
602
606
|
for split in self.scores:
|
|
603
607
|
new_scores[split] = []
|
|
604
608
|
for subset_scores in self.scores[split]:
|
|
@@ -610,10 +614,12 @@ class TaskResult(BaseModel):
|
|
|
610
614
|
}
|
|
611
615
|
)
|
|
612
616
|
new_res = {**self.to_dict(), "scores": new_scores}
|
|
613
|
-
|
|
614
|
-
return new_res
|
|
617
|
+
return TaskResult.from_validated(**new_res)
|
|
615
618
|
|
|
616
|
-
def validate_and_filter_scores(
|
|
619
|
+
def validate_and_filter_scores(
|
|
620
|
+
self,
|
|
621
|
+
task: AbsTask | None = None,
|
|
622
|
+
) -> TaskResult:
|
|
617
623
|
"""Validate and filter the scores against the task metadata.
|
|
618
624
|
|
|
619
625
|
This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
|
|
@@ -635,22 +641,32 @@ class TaskResult(BaseModel):
|
|
|
635
641
|
splits = task.eval_splits
|
|
636
642
|
hf_subsets = set(task.hf_subsets) # Convert to set once
|
|
637
643
|
|
|
638
|
-
new_scores = {}
|
|
644
|
+
new_scores: dict[str, list[Score]] = {}
|
|
639
645
|
seen_splits = set()
|
|
640
646
|
for split in self.scores:
|
|
641
647
|
if split not in splits:
|
|
642
648
|
continue
|
|
643
649
|
seen_subsets = set()
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
+
if task.is_aggregate:
|
|
651
|
+
# aggregate tasks only have the default subset, but in metadata can be multiple
|
|
652
|
+
new_scores[split] = [
|
|
653
|
+
_scores
|
|
654
|
+
for _scores in self.scores[split]
|
|
655
|
+
if _scores["hf_subset"] == "default"
|
|
656
|
+
]
|
|
657
|
+
seen_subsets = {"default"}
|
|
658
|
+
else:
|
|
659
|
+
new_scores[split] = [
|
|
660
|
+
_scores
|
|
661
|
+
for _scores in self.scores[split]
|
|
662
|
+
if _scores["hf_subset"] in hf_subsets
|
|
663
|
+
]
|
|
650
664
|
for _scores in new_scores[split]:
|
|
651
665
|
seen_subsets.add(_scores["hf_subset"])
|
|
652
666
|
|
|
653
|
-
if seen_subsets != hf_subsets
|
|
667
|
+
if seen_subsets != hf_subsets and not (
|
|
668
|
+
task.is_aggregate and "default" in seen_subsets
|
|
669
|
+
):
|
|
654
670
|
missing_subsets = hf_subsets - seen_subsets
|
|
655
671
|
if len(missing_subsets) > 2:
|
|
656
672
|
subset1, subset2 = list(missing_subsets)[:2]
|
|
@@ -658,14 +674,36 @@ class TaskResult(BaseModel):
|
|
|
658
674
|
else:
|
|
659
675
|
missing_subsets_str = str(missing_subsets)
|
|
660
676
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
)
|
|
677
|
+
msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
|
|
678
|
+
logger.warning(msg)
|
|
679
|
+
warnings.warn(msg)
|
|
680
|
+
for missing_subset in missing_subsets:
|
|
681
|
+
new_scores[split].append(
|
|
682
|
+
{
|
|
683
|
+
"hf_subset": missing_subset,
|
|
684
|
+
"main_score": np.nan,
|
|
685
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
686
|
+
missing_subset, []
|
|
687
|
+
),
|
|
688
|
+
}
|
|
689
|
+
)
|
|
664
690
|
seen_splits.add(split)
|
|
665
691
|
if seen_splits != set(splits):
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
)
|
|
692
|
+
msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
|
|
693
|
+
logger.warning(msg)
|
|
694
|
+
warnings.warn(msg)
|
|
695
|
+
for missing_split in set(splits) - seen_splits:
|
|
696
|
+
new_scores[missing_split] = []
|
|
697
|
+
for missing_subset in hf_subsets:
|
|
698
|
+
new_scores[missing_split].append(
|
|
699
|
+
{
|
|
700
|
+
"hf_subset": missing_subset,
|
|
701
|
+
"main_score": np.nan,
|
|
702
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
703
|
+
missing_subset, []
|
|
704
|
+
),
|
|
705
|
+
}
|
|
706
|
+
)
|
|
669
707
|
data = self.model_dump()
|
|
670
708
|
data["scores"] = new_scores
|
|
671
709
|
return type(self).model_construct(**data)
|
|
@@ -736,7 +774,7 @@ class TaskResult(BaseModel):
|
|
|
736
774
|
"mteb_version",
|
|
737
775
|
"dataset_revision",
|
|
738
776
|
],
|
|
739
|
-
) ->
|
|
777
|
+
) -> TaskResult:
|
|
740
778
|
"""Merges two TaskResult objects.
|
|
741
779
|
|
|
742
780
|
Args:
|
mteb/similarity_functions.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
import torch
|
|
2
6
|
|
|
3
|
-
from mteb.models import EncoderProtocol
|
|
4
7
|
from mteb.models.model_meta import ScoringFunction
|
|
5
|
-
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from mteb.models import EncoderProtocol
|
|
11
|
+
from mteb.types import Array
|
|
6
12
|
|
|
7
13
|
|
|
8
14
|
def _use_torch_compile():
|
|
@@ -186,7 +192,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
|
|
|
186
192
|
b,
|
|
187
193
|
)
|
|
188
194
|
|
|
189
|
-
return scores.max(axis=-1).values.sum(axis=-1)
|
|
195
|
+
return scores.max(axis=-1).values.sum(axis=-1) # type: ignore[call-overload]
|
|
190
196
|
|
|
191
197
|
|
|
192
198
|
# https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
|
|
@@ -217,7 +223,7 @@ def pairwise_max_sim(
|
|
|
217
223
|
document_embedding,
|
|
218
224
|
)
|
|
219
225
|
|
|
220
|
-
scores.append(query_document_score.max(axis=-1).values.sum())
|
|
226
|
+
scores.append(query_document_score.max(axis=-1).values.sum()) # type: ignore[call-overload]
|
|
221
227
|
|
|
222
228
|
return torch.stack(scores, dim=0)
|
|
223
229
|
|
|
@@ -317,11 +323,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
|
|
|
317
323
|
Returns:
|
|
318
324
|
Matrix with similarities
|
|
319
325
|
"""
|
|
320
|
-
|
|
321
|
-
|
|
326
|
+
text_embeddings_tensor = _convert_to_tensor(text_embeddings)
|
|
327
|
+
input_embeddings_tensor = _convert_to_tensor(input_embeddings)
|
|
322
328
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
329
|
+
text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
|
|
330
|
+
dim=-1, keepdim=True
|
|
331
|
+
)
|
|
332
|
+
input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
|
|
333
|
+
dim=-1, keepdim=True
|
|
334
|
+
)
|
|
335
|
+
logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
|
|
326
336
|
probs = (logits * 100).softmax(dim=-1)
|
|
327
337
|
return probs
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrieval,
|
|
5
5
|
CQADupstackEnglishRetrieval,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrieval,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrieval(),
|
|
20
20
|
CQADupstackEnglishRetrieval(),
|
|
21
21
|
CQADupstackGamingRetrieval(),
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts17_multilingual_visual_sts import (
|
|
4
4
|
STS17MultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_sts17
|
|
7
|
+
task_list_sts17 = [
|
|
8
8
|
STS17MultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["eng"], hf_subsets=["en-en"]
|
|
10
10
|
)
|