mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,29 @@
|
|
|
1
|
-
import
|
|
2
|
-
from collections.abc import Sequence
|
|
3
|
-
from typing import Any
|
|
1
|
+
from __future__ import annotations
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
from mteb.types import (
|
|
8
|
-
CorpusDatasetType,
|
|
9
|
-
QueryDatasetType,
|
|
10
|
-
RelevantDocumentsType,
|
|
11
|
-
RetrievalEvaluationResult,
|
|
12
|
-
RetrievalOutputType,
|
|
13
|
-
TopRankedDocumentsType,
|
|
14
|
-
)
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
15
5
|
|
|
16
6
|
from .evaluator import Evaluator
|
|
17
7
|
from .retrieval_metrics import (
|
|
18
8
|
calculate_retrieval_scores,
|
|
19
9
|
)
|
|
20
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.models import SearchProtocol
|
|
16
|
+
from mteb.types import (
|
|
17
|
+
CorpusDatasetType,
|
|
18
|
+
EncodeKwargs,
|
|
19
|
+
QueryDatasetType,
|
|
20
|
+
RelevantDocumentsType,
|
|
21
|
+
RetrievalEvaluationResult,
|
|
22
|
+
RetrievalOutputType,
|
|
23
|
+
TopRankedDocumentsType,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
21
27
|
logger = logging.getLogger(__name__)
|
|
22
28
|
|
|
23
29
|
|
|
@@ -48,7 +54,8 @@ class RetrievalEvaluator(Evaluator):
|
|
|
48
54
|
def __call__( # type: ignore[override]
|
|
49
55
|
self,
|
|
50
56
|
search_model: SearchProtocol,
|
|
51
|
-
encode_kwargs:
|
|
57
|
+
encode_kwargs: EncodeKwargs,
|
|
58
|
+
num_proc: int = 1,
|
|
52
59
|
) -> RetrievalOutputType:
|
|
53
60
|
logger.info("Running retrieval task - Indexing corpus...")
|
|
54
61
|
search_model.index(
|
|
@@ -57,6 +64,7 @@ class RetrievalEvaluator(Evaluator):
|
|
|
57
64
|
hf_split=self.hf_split,
|
|
58
65
|
hf_subset=self.hf_subset,
|
|
59
66
|
encode_kwargs=encode_kwargs,
|
|
67
|
+
num_proc=num_proc,
|
|
60
68
|
)
|
|
61
69
|
logger.info("Running retrieval task - Searching queries...")
|
|
62
70
|
return search_model.search(
|
|
@@ -67,6 +75,7 @@ class RetrievalEvaluator(Evaluator):
|
|
|
67
75
|
hf_subset=self.hf_subset,
|
|
68
76
|
encode_kwargs=encode_kwargs,
|
|
69
77
|
top_ranked=self.top_ranked,
|
|
78
|
+
num_proc=num_proc,
|
|
70
79
|
)
|
|
71
80
|
|
|
72
81
|
def evaluate(
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from collections import defaultdict
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import pandas as pd
|
|
@@ -8,14 +10,19 @@ import pytrec_eval
|
|
|
8
10
|
from packaging.version import Version
|
|
9
11
|
from sklearn.metrics import auc
|
|
10
12
|
|
|
11
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import RetrievalEvaluationResult
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Mapping
|
|
17
|
+
|
|
18
|
+
from mteb.types import RelevantDocumentsType
|
|
12
19
|
|
|
13
20
|
logger = logging.getLogger(__name__)
|
|
14
21
|
|
|
15
22
|
|
|
16
23
|
def mrr(
|
|
17
24
|
qrels: RelevantDocumentsType,
|
|
18
|
-
results:
|
|
25
|
+
results: Mapping[str, Mapping[str, float]],
|
|
19
26
|
k_values: list[int],
|
|
20
27
|
) -> dict[str, list[float]]:
|
|
21
28
|
mrr_metrics = defaultdict(list)
|
|
@@ -32,7 +39,7 @@ def mrr(
|
|
|
32
39
|
doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0
|
|
33
40
|
}
|
|
34
41
|
for k in k_values:
|
|
35
|
-
rr = 0
|
|
42
|
+
rr = 0.0
|
|
36
43
|
for rank, hit in enumerate(top_hits[query_id][0:k]):
|
|
37
44
|
if hit[0] in query_relevant_docs:
|
|
38
45
|
rr = 1.0 / (rank + 1)
|
|
@@ -45,8 +52,8 @@ def recall_cap(
|
|
|
45
52
|
qrels: RelevantDocumentsType,
|
|
46
53
|
results: dict[str, dict[str, float]],
|
|
47
54
|
k_values: list[int],
|
|
48
|
-
) -> dict[str, list[float]]:
|
|
49
|
-
capped_recall = defaultdict(list)
|
|
55
|
+
) -> dict[str, list[float | None]]:
|
|
56
|
+
capped_recall: dict[str, list[float | None]] = defaultdict(list)
|
|
50
57
|
|
|
51
58
|
k_max = max(k_values)
|
|
52
59
|
|
|
@@ -139,7 +146,7 @@ def calculate_pmrr(original_run, new_run, changed_qrels):
|
|
|
139
146
|
changes = []
|
|
140
147
|
for qid in changed_qrels.keys():
|
|
141
148
|
if qid + "-og" not in original_run or qid + "-changed" not in new_run:
|
|
142
|
-
|
|
149
|
+
logger.warning(f"Query {qid} not found in the runs for calculating p-MRR")
|
|
143
150
|
continue
|
|
144
151
|
original_qid_run = original_run[qid + "-og"]
|
|
145
152
|
new_qid_run = new_run[qid + "-changed"]
|
|
@@ -188,7 +195,7 @@ def evaluate_p_mrr_change(
|
|
|
188
195
|
Returns:
|
|
189
196
|
A dictionary with the scores, including "p-MRR", "og" and "changed" keys.
|
|
190
197
|
"""
|
|
191
|
-
followir_scores = defaultdict(dict)
|
|
198
|
+
followir_scores: dict[str, float | dict[str, float]] = defaultdict(dict)
|
|
192
199
|
|
|
193
200
|
qrels_sep = {
|
|
194
201
|
"og": {k: v for k, v in qrels.items() if k.endswith("-og")},
|
|
@@ -227,7 +234,7 @@ def evaluate_p_mrr_change(
|
|
|
227
234
|
ndcg, _map, recall, precision, naucs, avg_mrr, naucs_mrr, cv_recall, {}
|
|
228
235
|
)
|
|
229
236
|
for key, value in scores_dict.items():
|
|
230
|
-
followir_scores[name][key] = value
|
|
237
|
+
followir_scores[name][key] = value # type: ignore[index]
|
|
231
238
|
|
|
232
239
|
return followir_scores
|
|
233
240
|
|
|
@@ -254,8 +261,8 @@ def confidence_scores(sim_scores: list[float]) -> dict[str, float]:
|
|
|
254
261
|
sim_scores_sorted = sorted(sim_scores)[::-1]
|
|
255
262
|
|
|
256
263
|
cs_max = sim_scores_sorted[0]
|
|
257
|
-
cs_std = np.std(sim_scores)
|
|
258
|
-
cs_diff1 =
|
|
264
|
+
cs_std = float(np.std(sim_scores))
|
|
265
|
+
cs_diff1 = 0.0
|
|
259
266
|
if len(sim_scores) > 1:
|
|
260
267
|
cs_diff1 = sim_scores_sorted[0] - sim_scores_sorted[1]
|
|
261
268
|
elif len(sim_scores) == 1:
|
|
@@ -410,7 +417,7 @@ def make_score_dict(
|
|
|
410
417
|
cv_recall: dict[str, float],
|
|
411
418
|
task_scores: dict[str, float],
|
|
412
419
|
previous_results_model_meta: dict[str, Any] | None = None,
|
|
413
|
-
) -> dict[str,
|
|
420
|
+
) -> dict[str, Any]:
|
|
414
421
|
return {
|
|
415
422
|
**{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
|
|
416
423
|
**{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
|
|
@@ -528,7 +535,7 @@ def max_over_subqueries(
|
|
|
528
535
|
|
|
529
536
|
|
|
530
537
|
def calculate_retrieval_scores(
|
|
531
|
-
results:
|
|
538
|
+
results: Mapping[str, Mapping[str, float]],
|
|
532
539
|
qrels: RelevantDocumentsType,
|
|
533
540
|
k_values: list[int],
|
|
534
541
|
skip_first_result: bool = False,
|
|
@@ -576,7 +583,7 @@ def calculate_retrieval_scores(
|
|
|
576
583
|
|
|
577
584
|
|
|
578
585
|
def evaluate_abstention(
|
|
579
|
-
results:
|
|
586
|
+
results: Mapping[str, Mapping[str, float]],
|
|
580
587
|
metric_scores: dict[str, list[float]],
|
|
581
588
|
) -> dict[str, float]:
|
|
582
589
|
"""Computes normalized Area Under the Curve on a set of evaluated instances as presented in the paper https://arxiv.org/abs/2402.12997
|
|
@@ -591,21 +598,21 @@ def evaluate_abstention(
|
|
|
591
598
|
all_sim_scores = [list(results[qid].values()) for qid in list(results.keys())]
|
|
592
599
|
all_conf_scores = [confidence_scores(sim_scores) for sim_scores in all_sim_scores]
|
|
593
600
|
conf_fcts = list(all_conf_scores[0].keys())
|
|
594
|
-
|
|
601
|
+
all_conf_scores_ = {
|
|
595
602
|
fct: np.array([x[fct] for x in all_conf_scores]) for fct in conf_fcts
|
|
596
603
|
}
|
|
597
|
-
|
|
604
|
+
metric_scores_ = {k: np.array(v) for k, v in metric_scores.items()}
|
|
598
605
|
naucs = {}
|
|
599
606
|
|
|
600
|
-
for metric_name, scores in
|
|
601
|
-
for fct, conf_scores in
|
|
607
|
+
for metric_name, scores in metric_scores_.items():
|
|
608
|
+
for fct, conf_scores in all_conf_scores_.items():
|
|
602
609
|
naucs[f"nAUC_{metric_name}_{fct}"] = nauc(conf_scores, scores)
|
|
603
610
|
|
|
604
611
|
return naucs
|
|
605
612
|
|
|
606
613
|
|
|
607
614
|
def calculate_cv_recall(
|
|
608
|
-
results:
|
|
615
|
+
results: Mapping[str, Mapping[str, float]],
|
|
609
616
|
qrels: RelevantDocumentsType,
|
|
610
617
|
k_values: list[int],
|
|
611
618
|
skip_first_result: bool = False,
|
|
@@ -1,27 +1,31 @@
|
|
|
1
|
-
import
|
|
2
|
-
from typing import Any, Protocol
|
|
1
|
+
from __future__ import annotations
|
|
3
2
|
|
|
4
|
-
import
|
|
5
|
-
from
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
|
-
from typing_extensions import Self
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Protocol, cast
|
|
8
5
|
|
|
9
6
|
from mteb._create_dataloaders import create_dataloader
|
|
10
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
|
-
from mteb.models import EncoderProtocol
|
|
12
|
-
from mteb.types import BatchedInput
|
|
13
7
|
|
|
14
8
|
from .evaluator import Evaluator
|
|
15
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import numpy as np
|
|
12
|
+
from datasets import Dataset
|
|
13
|
+
from torch.utils.data import DataLoader
|
|
14
|
+
from typing_extensions import Self
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.models import EncoderProtocol
|
|
18
|
+
from mteb.types import Array, BatchedInput, EncodeKwargs
|
|
19
|
+
|
|
16
20
|
logger = logging.getLogger(__name__)
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class SklearnModelProtocol(Protocol):
|
|
20
|
-
def fit(self, X:
|
|
21
|
-
def predict(self, X:
|
|
24
|
+
def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803
|
|
25
|
+
def predict(self, X: Array) -> np.ndarray: ... # noqa: N803
|
|
22
26
|
def get_params(self) -> dict[str, Any]: ...
|
|
23
|
-
def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
|
|
24
|
-
def score(self, X:
|
|
27
|
+
def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
|
|
28
|
+
def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803
|
|
25
29
|
|
|
26
30
|
|
|
27
31
|
class SklearnEvaluator(Evaluator):
|
|
@@ -50,18 +54,20 @@ class SklearnEvaluator(Evaluator):
|
|
|
50
54
|
self.evaluator_model = evaluator_model
|
|
51
55
|
|
|
52
56
|
def create_dataloaders(
|
|
53
|
-
self, encode_kwargs:
|
|
57
|
+
self, encode_kwargs: EncodeKwargs, num_proc: int
|
|
54
58
|
) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
|
|
55
59
|
dataloader_train = create_dataloader(
|
|
56
60
|
self.train_dataset,
|
|
57
61
|
self.task_metadata,
|
|
58
62
|
input_column=self.values_column_name,
|
|
63
|
+
num_proc=num_proc,
|
|
59
64
|
**encode_kwargs,
|
|
60
65
|
)
|
|
61
66
|
dataloader_test = create_dataloader(
|
|
62
67
|
self.eval_dataset,
|
|
63
68
|
self.task_metadata,
|
|
64
69
|
input_column=self.values_column_name,
|
|
70
|
+
num_proc=num_proc,
|
|
65
71
|
**encode_kwargs,
|
|
66
72
|
)
|
|
67
73
|
return dataloader_train, dataloader_test
|
|
@@ -70,15 +76,17 @@ class SklearnEvaluator(Evaluator):
|
|
|
70
76
|
self,
|
|
71
77
|
model: EncoderProtocol,
|
|
72
78
|
*,
|
|
73
|
-
encode_kwargs:
|
|
74
|
-
test_cache:
|
|
75
|
-
|
|
79
|
+
encode_kwargs: EncodeKwargs,
|
|
80
|
+
test_cache: Array | None = None,
|
|
81
|
+
num_proc: int = 1,
|
|
82
|
+
) -> tuple[np.ndarray, Array]:
|
|
76
83
|
"""Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.
|
|
77
84
|
|
|
78
85
|
Args:
|
|
79
86
|
model: Encoder
|
|
80
87
|
encode_kwargs: encode kwargs
|
|
81
88
|
test_cache: embeddings of the test set, if already computed
|
|
89
|
+
num_proc: number of processes to use
|
|
82
90
|
|
|
83
91
|
Returns:
|
|
84
92
|
Tuple of test predictions and embeddings
|
|
@@ -86,6 +94,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
86
94
|
"""
|
|
87
95
|
dataloader_train, dataloader_test = self.create_dataloaders(
|
|
88
96
|
encode_kwargs=encode_kwargs,
|
|
97
|
+
num_proc=num_proc,
|
|
89
98
|
)
|
|
90
99
|
|
|
91
100
|
logger.info("Running - Encoding samples...")
|
|
@@ -104,6 +113,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
104
113
|
hf_subset=self.hf_subset,
|
|
105
114
|
**encode_kwargs,
|
|
106
115
|
)
|
|
116
|
+
test_cache = cast("Array", test_cache)
|
|
107
117
|
|
|
108
118
|
logger.info("Running - Fitting classifier...")
|
|
109
119
|
y_train = self.train_dataset[self.label_column_name]
|
|
@@ -1,15 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
3
5
|
|
|
4
|
-
import numpy as np
|
|
5
6
|
import torch
|
|
6
7
|
from datasets import Dataset
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
9
10
|
from mteb._create_dataloaders import _create_dataloader_from_texts
|
|
10
11
|
from mteb._evaluators.evaluator import Evaluator
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.models import EncoderProtocol
|
|
16
|
+
from mteb.types import Array, EncodeKwargs
|
|
13
17
|
|
|
14
18
|
logger = logging.getLogger(__name__)
|
|
15
19
|
|
|
@@ -33,7 +37,11 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
33
37
|
self.task_metadata = task_metadata
|
|
34
38
|
|
|
35
39
|
def __call__(
|
|
36
|
-
self,
|
|
40
|
+
self,
|
|
41
|
+
model: EncoderProtocol,
|
|
42
|
+
*,
|
|
43
|
+
encode_kwargs: EncodeKwargs,
|
|
44
|
+
num_proc: int = 1,
|
|
37
45
|
) -> dict[str, list[dict[str, float]]]:
|
|
38
46
|
pair_elements = {p for pair in self.pairs for p in pair}
|
|
39
47
|
if isinstance(self.sentences, Dataset):
|
|
@@ -48,6 +56,7 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
48
56
|
for sub in tqdm(subsets):
|
|
49
57
|
dataloader = _create_dataloader_from_texts(
|
|
50
58
|
self.sentences[sub],
|
|
59
|
+
num_proc=num_proc,
|
|
51
60
|
**encode_kwargs,
|
|
52
61
|
)
|
|
53
62
|
embeddings[sub] = model.encode(
|
|
@@ -69,11 +78,11 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
69
78
|
|
|
70
79
|
def _similarity_search(
|
|
71
80
|
self,
|
|
72
|
-
query_embeddings:
|
|
73
|
-
corpus_embeddings:
|
|
81
|
+
query_embeddings: Array,
|
|
82
|
+
corpus_embeddings: Array,
|
|
74
83
|
model: EncoderProtocol,
|
|
75
84
|
query_chunk_size: int = 100,
|
|
76
|
-
corpus_chunk_size: int =
|
|
85
|
+
corpus_chunk_size: int = 500_000,
|
|
77
86
|
) -> list[dict[str, float]]:
|
|
78
87
|
"""This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
|
|
79
88
|
|
|
@@ -104,13 +113,15 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
104
113
|
):
|
|
105
114
|
query_embeddings = query_embeddings.to(corpus_embeddings.device)
|
|
106
115
|
|
|
107
|
-
queries_result_list
|
|
116
|
+
queries_result_list: list[list[dict[str, float]]] = [
|
|
117
|
+
[] for _ in range(len(query_embeddings))
|
|
118
|
+
]
|
|
108
119
|
|
|
109
120
|
for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
|
|
110
121
|
# Iterate over chunks of the corpus
|
|
111
122
|
for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
|
|
112
123
|
# Compute cosine similarities
|
|
113
|
-
similarity_scores = model.similarity(
|
|
124
|
+
similarity_scores = model.similarity(
|
|
114
125
|
query_embeddings[
|
|
115
126
|
query_start_idx : query_start_idx + query_chunk_size
|
|
116
127
|
],
|
|
@@ -120,15 +131,17 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
120
131
|
)
|
|
121
132
|
|
|
122
133
|
# Get top-k scores
|
|
123
|
-
|
|
124
|
-
torch.
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
134
|
+
cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
|
|
135
|
+
torch.topk(
|
|
136
|
+
torch.tensor(similarity_scores),
|
|
137
|
+
1,
|
|
138
|
+
dim=1,
|
|
139
|
+
largest=True,
|
|
140
|
+
sorted=False,
|
|
141
|
+
)
|
|
129
142
|
)
|
|
130
|
-
cos_scores_top_k_values =
|
|
131
|
-
cos_scores_top_k_idx =
|
|
143
|
+
cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
|
|
144
|
+
cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
|
|
132
145
|
|
|
133
146
|
for query_itr in range(len(similarity_scores)):
|
|
134
147
|
for sub_corpus_id, score in zip(
|
|
@@ -141,11 +154,14 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
141
154
|
{"corpus_id": corpus_id, "score": score}
|
|
142
155
|
)
|
|
143
156
|
|
|
157
|
+
result_queries_list: list[dict[str, float]] = [
|
|
158
|
+
{} for _ in range(len(query_embeddings))
|
|
159
|
+
]
|
|
144
160
|
# Sort and strip to top_k results
|
|
145
161
|
for idx in range(len(queries_result_list)):
|
|
146
162
|
queries_result_list[idx] = sorted(
|
|
147
163
|
queries_result_list[idx], key=lambda x: x["score"], reverse=True
|
|
148
164
|
)
|
|
149
|
-
|
|
165
|
+
result_queries_list[idx] = queries_result_list[idx][0]
|
|
150
166
|
|
|
151
|
-
return
|
|
167
|
+
return result_queries_list
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import sys
|
|
3
|
-
from typing import
|
|
5
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import torch
|
|
@@ -9,10 +11,13 @@ from tqdm.auto import tqdm
|
|
|
9
11
|
|
|
10
12
|
from mteb._create_dataloaders import _create_dataloader_from_texts
|
|
11
13
|
from mteb._evaluators.evaluator import Evaluator
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
|
-
from mteb.models import EncoderProtocol
|
|
14
14
|
from mteb.similarity_functions import cos_sim, dot_score
|
|
15
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
18
|
+
from mteb.models import EncoderProtocol
|
|
19
|
+
from mteb.types import EncodeKwargs
|
|
20
|
+
|
|
16
21
|
# if later than python 3.13 use typing module
|
|
17
22
|
if sys.version_info >= (3, 13):
|
|
18
23
|
from warnings import deprecated
|
|
@@ -94,7 +99,8 @@ class SummarizationEvaluator(Evaluator):
|
|
|
94
99
|
self,
|
|
95
100
|
model: EncoderProtocol,
|
|
96
101
|
*,
|
|
97
|
-
encode_kwargs:
|
|
102
|
+
encode_kwargs: EncodeKwargs,
|
|
103
|
+
num_proc: int = 1,
|
|
98
104
|
) -> SummarizationDistances:
|
|
99
105
|
# Get the human & machine summaries for the text in one go for all
|
|
100
106
|
human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
|
|
@@ -110,6 +116,7 @@ class SummarizationEvaluator(Evaluator):
|
|
|
110
116
|
for human_summaries in self.human_summaries
|
|
111
117
|
for summary in human_summaries
|
|
112
118
|
],
|
|
119
|
+
num_proc=num_proc,
|
|
113
120
|
**encode_kwargs,
|
|
114
121
|
),
|
|
115
122
|
task_metadata=self.task_metadata,
|
|
@@ -135,10 +142,10 @@ class SummarizationEvaluator(Evaluator):
|
|
|
135
142
|
)
|
|
136
143
|
|
|
137
144
|
# Split the embeddings into the original human & machine summaries
|
|
138
|
-
|
|
145
|
+
embs_human_summaries_all_split = np.split(
|
|
139
146
|
embs_human_summaries_all, np.cumsum(human_lens)[:-1]
|
|
140
147
|
)
|
|
141
|
-
|
|
148
|
+
embs_machine_summaries_all_split = np.split(
|
|
142
149
|
embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
|
|
143
150
|
)
|
|
144
151
|
|
|
@@ -148,7 +155,9 @@ class SummarizationEvaluator(Evaluator):
|
|
|
148
155
|
all_human_scores = []
|
|
149
156
|
|
|
150
157
|
for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
|
|
151
|
-
enumerate(
|
|
158
|
+
enumerate(
|
|
159
|
+
zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
|
|
160
|
+
),
|
|
152
161
|
desc="Scoring",
|
|
153
162
|
total=len(self.human_summaries),
|
|
154
163
|
):
|
|
@@ -164,7 +173,7 @@ class SummarizationEvaluator(Evaluator):
|
|
|
164
173
|
dot_scores = dot_score(emb_machine_summary, embs_human_summaries)
|
|
165
174
|
|
|
166
175
|
_sim_score = [
|
|
167
|
-
float(model.similarity(emb_machine_summary, emb_human_summary))
|
|
176
|
+
float(model.similarity(emb_machine_summary, emb_human_summary))
|
|
168
177
|
for emb_human_summary in embs_human_summaries
|
|
169
178
|
]
|
|
170
179
|
sim_score = torch.tensor(_sim_score)
|
|
@@ -216,17 +225,19 @@ class SummarizationEvaluator(Evaluator):
|
|
|
216
225
|
strict=True,
|
|
217
226
|
):
|
|
218
227
|
cosine_spearman_scores.append(
|
|
219
|
-
spearmanr(human_scores, cosine_pred_scores).statistic
|
|
228
|
+
float(spearmanr(human_scores, cosine_pred_scores).statistic)
|
|
220
229
|
)
|
|
221
230
|
cosine_pearson_scores.append(
|
|
222
|
-
pearsonr(human_scores, cosine_pred_scores).statistic
|
|
231
|
+
float(pearsonr(human_scores, cosine_pred_scores).statistic)
|
|
223
232
|
)
|
|
224
233
|
dot_spearman_scores.append(
|
|
225
|
-
spearmanr(human_scores, dot_pred_scores).statistic
|
|
234
|
+
float(spearmanr(human_scores, dot_pred_scores).statistic)
|
|
226
235
|
)
|
|
227
|
-
dot_pearson_scores.append(
|
|
228
|
-
|
|
229
|
-
|
|
236
|
+
dot_pearson_scores.append(
|
|
237
|
+
float(pearsonr(human_scores, dot_pred_scores).statistic)
|
|
238
|
+
)
|
|
239
|
+
spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
|
|
240
|
+
pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))
|
|
230
241
|
|
|
231
242
|
return SummarizationMetrics(
|
|
232
243
|
pearson=float(np.mean(pearson_scores)),
|
|
@@ -273,10 +284,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
|
|
|
273
284
|
pearson_scores.append(pearsonr(human_scores, sim_scores))
|
|
274
285
|
|
|
275
286
|
return SummarizationMetrics(
|
|
276
|
-
pearson=float(np.mean(pearson_scores)),
|
|
277
|
-
spearman=float(np.mean(spearman_scores)),
|
|
278
|
-
cosine_spearman=float(np.mean(cosine_spearman_scores)),
|
|
279
|
-
cosine_pearson=float(np.mean(cosine_pearson_scores)),
|
|
280
|
-
dot_pearson=float(np.mean(dot_pearson_scores)),
|
|
281
|
-
dot_spearman=float(np.mean(dot_spearman_scores)),
|
|
287
|
+
pearson=float(np.mean(pearson_scores)), # type: ignore[arg-type]
|
|
288
|
+
spearman=float(np.mean(spearman_scores)), # type: ignore[arg-type]
|
|
289
|
+
cosine_spearman=float(np.mean(cosine_spearman_scores)), # type: ignore[arg-type]
|
|
290
|
+
cosine_pearson=float(np.mean(cosine_pearson_scores)), # type: ignore[arg-type]
|
|
291
|
+
dot_pearson=float(np.mean(dot_pearson_scores)), # type: ignore[arg-type]
|
|
292
|
+
dot_spearman=float(np.mean(dot_spearman_scores)), # type: ignore[arg-type]
|
|
282
293
|
)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
3
5
|
|
|
4
6
|
from datasets import Dataset
|
|
5
7
|
|
|
@@ -7,13 +9,17 @@ from mteb._create_dataloaders import (
|
|
|
7
9
|
_create_dataloader_from_texts,
|
|
8
10
|
create_dataloader,
|
|
9
11
|
)
|
|
10
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
|
-
from mteb.models import EncoderProtocol
|
|
12
12
|
from mteb.similarity_functions import similarity
|
|
13
|
-
from mteb.types import Array
|
|
14
13
|
|
|
15
14
|
from .evaluator import Evaluator
|
|
16
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from datasets import Dataset
|
|
18
|
+
|
|
19
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
20
|
+
from mteb.models import EncoderProtocol
|
|
21
|
+
from mteb.types import Array, EncodeKwargs
|
|
22
|
+
|
|
17
23
|
logger = logging.getLogger(__name__)
|
|
18
24
|
|
|
19
25
|
|
|
@@ -38,12 +44,17 @@ class ZeroShotClassificationEvaluator(Evaluator):
|
|
|
38
44
|
self.hf_subset = hf_subset
|
|
39
45
|
|
|
40
46
|
def __call__(
|
|
41
|
-
self,
|
|
47
|
+
self,
|
|
48
|
+
model: EncoderProtocol,
|
|
49
|
+
*,
|
|
50
|
+
encode_kwargs: EncodeKwargs,
|
|
51
|
+
num_proc: int = 1,
|
|
42
52
|
) -> Array:
|
|
43
53
|
dataloader = create_dataloader(
|
|
44
54
|
self.dataset,
|
|
45
55
|
input_column=self.input_column_name,
|
|
46
56
|
task_metadata=self.task_metadata,
|
|
57
|
+
num_proc=num_proc,
|
|
47
58
|
**encode_kwargs,
|
|
48
59
|
)
|
|
49
60
|
|
mteb/_helpful_enum.py
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
"""Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
4
7
|
|
|
5
8
|
import datasets
|
|
6
9
|
import pandas as pd
|
|
7
|
-
from datasets import
|
|
10
|
+
from datasets import DatasetDict
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from datasets import Dataset
|
|
8
14
|
|
|
9
|
-
from mteb import TaskMetadata
|
|
15
|
+
from mteb import TaskMetadata
|
|
10
16
|
|
|
11
17
|
logger = logging.getLogger(__name__)
|
|
12
18
|
|
|
@@ -61,7 +67,7 @@ def filter_unclear_label(
|
|
|
61
67
|
for text, label in zip(ds[input_column], ds[label_column]):
|
|
62
68
|
key = text.strip().lower()
|
|
63
69
|
normalized.setdefault(key, set()).add(
|
|
64
|
-
label if isinstance(label, (str, int, float)) else tuple(label)
|
|
70
|
+
label if isinstance(label, (str, int, float)) else tuple(label) # type: ignore[arg-type]
|
|
65
71
|
)
|
|
66
72
|
|
|
67
73
|
bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
2
5
|
|
|
3
6
|
from datasets import DatasetDict
|
|
4
7
|
|
|
5
|
-
from mteb import TaskMetadata
|
|
6
|
-
from mteb.abstasks import AbsTaskClassification
|
|
7
8
|
from mteb.abstasks._data_filter.filters import (
|
|
8
9
|
deduplicate,
|
|
9
10
|
filter_empty,
|
|
@@ -13,6 +14,10 @@ from mteb.abstasks._data_filter.filters import (
|
|
|
13
14
|
split_train_test,
|
|
14
15
|
)
|
|
15
16
|
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from mteb import TaskMetadata
|
|
19
|
+
from mteb.abstasks import AbsTaskClassification
|
|
20
|
+
|
|
16
21
|
logger = logging.getLogger(__name__)
|
|
17
22
|
|
|
18
23
|
|
|
@@ -89,6 +94,9 @@ def process_classification(
|
|
|
89
94
|
subset=None,
|
|
90
95
|
)
|
|
91
96
|
|
|
97
|
+
if task.dataset is None:
|
|
98
|
+
raise ValueError("Task dataset is None.")
|
|
99
|
+
|
|
92
100
|
new_ds = {}
|
|
93
101
|
for subset in task.dataset:
|
|
94
102
|
new_ds[subset] = clean_dataset(
|