mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
mteb/__init__.py
CHANGED
|
@@ -9,8 +9,10 @@ from mteb.filter_tasks import filter_tasks
|
|
|
9
9
|
from mteb.get_tasks import get_task, get_tasks
|
|
10
10
|
from mteb.load_results import load_results
|
|
11
11
|
from mteb.models import (
|
|
12
|
+
CacheBackendProtocol,
|
|
12
13
|
CrossEncoderProtocol,
|
|
13
14
|
EncoderProtocol,
|
|
15
|
+
IndexEncoderSearchProtocol,
|
|
14
16
|
SearchProtocol,
|
|
15
17
|
SentenceTransformerEncoderWrapper,
|
|
16
18
|
)
|
|
@@ -27,8 +29,10 @@ __all__ = [
|
|
|
27
29
|
"AbsTask",
|
|
28
30
|
"Benchmark",
|
|
29
31
|
"BenchmarkResults",
|
|
32
|
+
"CacheBackendProtocol",
|
|
30
33
|
"CrossEncoderProtocol",
|
|
31
34
|
"EncoderProtocol",
|
|
35
|
+
"IndexEncoderSearchProtocol",
|
|
32
36
|
"SearchProtocol",
|
|
33
37
|
"SentenceTransformerEncoderWrapper",
|
|
34
38
|
"TaskMetadata",
|
mteb/_create_dataloaders.py
CHANGED
|
@@ -3,7 +3,7 @@ from collections.abc import Callable
|
|
|
3
3
|
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
import torch
|
|
6
|
-
from datasets import Dataset
|
|
6
|
+
from datasets import Dataset, Image
|
|
7
7
|
from torch.utils.data import DataLoader, default_collate
|
|
8
8
|
|
|
9
9
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
@@ -22,12 +22,14 @@ logger = logging.getLogger(__name__)
|
|
|
22
22
|
def _create_dataloader_from_texts(
|
|
23
23
|
text: list[str],
|
|
24
24
|
batch_size: int = 32,
|
|
25
|
+
**kwargs: dict[str, Any],
|
|
25
26
|
) -> DataLoader[TextInput]:
|
|
26
27
|
"""Create a dataloader from a list of text.
|
|
27
28
|
|
|
28
29
|
Args:
|
|
29
30
|
text: A list of text to create a dataloader from.
|
|
30
31
|
batch_size: Batch size for the dataloader.
|
|
32
|
+
kwargs: Not used, present catching extra arguments.
|
|
31
33
|
|
|
32
34
|
Returns:
|
|
33
35
|
A dataloader with the text.
|
|
@@ -244,14 +246,15 @@ def _prepare_image_dataset(
|
|
|
244
246
|
transform: Callable[[Any], Any] | None = None,
|
|
245
247
|
) -> Dataset:
|
|
246
248
|
"""Prepare the image dataset by converting images to RGB and applying transformations."""
|
|
247
|
-
# If the dataset uses a different column name for images, rename it to "image".
|
|
248
249
|
if (
|
|
249
250
|
image_column_name
|
|
250
251
|
and image_column_name in dataset.column_names
|
|
251
252
|
and "image" not in dataset.column_names
|
|
252
253
|
):
|
|
253
254
|
dataset = dataset.rename_column(image_column_name, "image")
|
|
254
|
-
#
|
|
255
|
+
# don't process image if it's already in the correct format
|
|
256
|
+
if isinstance(dataset.features["image"], Image):
|
|
257
|
+
return dataset
|
|
255
258
|
return dataset.map(
|
|
256
259
|
_convert_images_to_rgb,
|
|
257
260
|
fn_kwargs={"image_col_name": "image", "transform": transform},
|
|
@@ -12,6 +12,7 @@ from mteb._create_dataloaders import create_dataloader
|
|
|
12
12
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models import EncoderProtocol
|
|
14
14
|
from mteb.similarity_functions import compute_pairwise_similarity
|
|
15
|
+
from mteb.types import PromptType
|
|
15
16
|
|
|
16
17
|
from .evaluator import Evaluator
|
|
17
18
|
|
|
@@ -42,22 +43,18 @@ class AnySTSEvaluator(Evaluator):
|
|
|
42
43
|
task_metadata: TaskMetadata,
|
|
43
44
|
hf_split: str,
|
|
44
45
|
hf_subset: str,
|
|
46
|
+
input1_prompt_type: PromptType | None,
|
|
47
|
+
input2_prompt_type: PromptType | None,
|
|
45
48
|
**kwargs,
|
|
46
49
|
) -> None:
|
|
47
50
|
super().__init__(**kwargs)
|
|
48
|
-
self.
|
|
49
|
-
|
|
50
|
-
task_metadata,
|
|
51
|
-
input_column=sentences_column_names[0],
|
|
52
|
-
)
|
|
53
|
-
self.second_column = create_dataloader(
|
|
54
|
-
dataset,
|
|
55
|
-
task_metadata,
|
|
56
|
-
input_column=sentences_column_names[1],
|
|
57
|
-
)
|
|
51
|
+
self.dataset = dataset
|
|
52
|
+
self.input_columns = sentences_column_names
|
|
58
53
|
self.task_metadata = task_metadata
|
|
59
54
|
self.hf_split = hf_split
|
|
60
55
|
self.hf_subset = hf_subset
|
|
56
|
+
self.input1_prompt_type = input1_prompt_type
|
|
57
|
+
self.input2_prompt_type = input2_prompt_type
|
|
61
58
|
|
|
62
59
|
def __call__(
|
|
63
60
|
self,
|
|
@@ -67,19 +64,31 @@ class AnySTSEvaluator(Evaluator):
|
|
|
67
64
|
) -> STSEvaluatorScores:
|
|
68
65
|
logger.info("Running semantic similarity - Encoding samples (1/2)")
|
|
69
66
|
embeddings1 = model.encode(
|
|
70
|
-
|
|
67
|
+
create_dataloader(
|
|
68
|
+
self.dataset,
|
|
69
|
+
self.task_metadata,
|
|
70
|
+
input_column=self.input_columns[0],
|
|
71
|
+
**encode_kwargs,
|
|
72
|
+
),
|
|
71
73
|
task_metadata=self.task_metadata,
|
|
72
74
|
hf_split=self.hf_split,
|
|
73
75
|
hf_subset=self.hf_subset,
|
|
76
|
+
prompt_type=self.input1_prompt_type,
|
|
74
77
|
**encode_kwargs,
|
|
75
78
|
)
|
|
76
79
|
|
|
77
80
|
logger.info("Running semantic similarity - Encoding samples (2/2)...")
|
|
78
81
|
embeddings2 = model.encode(
|
|
79
|
-
|
|
82
|
+
create_dataloader(
|
|
83
|
+
self.dataset,
|
|
84
|
+
self.task_metadata,
|
|
85
|
+
input_column=self.input_columns[1],
|
|
86
|
+
**encode_kwargs,
|
|
87
|
+
),
|
|
80
88
|
task_metadata=self.task_metadata,
|
|
81
89
|
hf_split=self.hf_split,
|
|
82
90
|
hf_subset=self.hf_subset,
|
|
91
|
+
prompt_type=self.input2_prompt_type,
|
|
83
92
|
**encode_kwargs,
|
|
84
93
|
)
|
|
85
94
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def hamming_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
|
|
5
|
+
"""Compute the Hamming score (a.k.a. label-based accuracy) for multilabel classification.
|
|
6
|
+
|
|
7
|
+
The Hamming score is the fraction of labels that are correctly predicted for each sample,
|
|
8
|
+
averaged over all samples. For samples where both y_true and y_pred have no labels,
|
|
9
|
+
the score is 1.0 (perfect agreement).
|
|
10
|
+
|
|
11
|
+
Args:
|
|
12
|
+
y_true: Binary matrix of true labels with shape (n_samples, n_labels)
|
|
13
|
+
y_pred: Binary matrix of predicted labels with shape (n_samples, n_labels)
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
float: Hamming score between 0.0 and 1.0
|
|
17
|
+
|
|
18
|
+
Raises:
|
|
19
|
+
ValueError: If inputs are invalid or have incompatible shapes
|
|
20
|
+
TypeError: If inputs cannot be converted to numpy arrays
|
|
21
|
+
"""
|
|
22
|
+
y_true = np.asarray(y_true)
|
|
23
|
+
y_pred = np.asarray(y_pred)
|
|
24
|
+
|
|
25
|
+
# Check shapes
|
|
26
|
+
if y_true.shape != y_pred.shape:
|
|
27
|
+
raise ValueError(
|
|
28
|
+
f"Shape mismatch: y_true {y_true.shape} != y_pred {y_pred.shape}"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Check if arrays are empty
|
|
32
|
+
if y_true.size == 0:
|
|
33
|
+
raise ValueError("Input arrays cannot be empty")
|
|
34
|
+
|
|
35
|
+
# Ensure 2D arrays
|
|
36
|
+
if y_true.ndim != 2:
|
|
37
|
+
raise ValueError(f"Arrays must be 2D, got {y_true.ndim}D")
|
|
38
|
+
|
|
39
|
+
# Check for binary values
|
|
40
|
+
if not (np.all(np.isin(y_true, [0, 1])) and np.all(np.isin(y_pred, [0, 1]))):
|
|
41
|
+
raise ValueError("Arrays must contain only binary values (0 and 1)")
|
|
42
|
+
|
|
43
|
+
# Convert to boolean for bitwise operations
|
|
44
|
+
y_true_bool = y_true.astype(bool)
|
|
45
|
+
y_pred_bool = y_pred.astype(bool)
|
|
46
|
+
|
|
47
|
+
# Calculate intersection and union for each sample
|
|
48
|
+
intersection = (y_true_bool & y_pred_bool).sum(axis=1)
|
|
49
|
+
union = (y_true_bool | y_pred_bool).sum(axis=1)
|
|
50
|
+
|
|
51
|
+
# Handle division by zero: when union is 0, both are all zeros, so score is 1.0
|
|
52
|
+
scores = np.where(union == 0, 1.0, intersection / union)
|
|
53
|
+
|
|
54
|
+
return float(scores.mean())
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
7
|
import torch.nn.functional as F
|
|
6
8
|
from datasets import Dataset
|
|
7
|
-
from PIL.Image import Image
|
|
8
9
|
from torch.utils.data import DataLoader
|
|
9
10
|
|
|
10
11
|
from mteb._create_dataloaders import (
|
|
@@ -15,6 +16,10 @@ from mteb._requires_package import requires_image_dependencies
|
|
|
15
16
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
17
|
from mteb.models.models_protocols import EncoderProtocol
|
|
17
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from PIL.Image import Image
|
|
21
|
+
|
|
22
|
+
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
|
20
25
|
|
|
@@ -103,7 +108,7 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
103
108
|
text_embeddings = model.encode(
|
|
104
109
|
DataLoader(
|
|
105
110
|
Dataset.from_dict({"text": texts}),
|
|
106
|
-
|
|
111
|
+
**encode_kwargs,
|
|
107
112
|
),
|
|
108
113
|
task_metadata=self.task_metadata,
|
|
109
114
|
hf_subset=self.hf_subset,
|
|
@@ -122,8 +127,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
122
127
|
image_embeddings = model.encode(
|
|
123
128
|
DataLoader(
|
|
124
129
|
CustomImageDataset(images),
|
|
125
|
-
batch_size=encode_kwargs["batch_size"],
|
|
126
130
|
collate_fn=lambda x: {"image": [item["image"] for item in x]},
|
|
131
|
+
**encode_kwargs,
|
|
127
132
|
),
|
|
128
133
|
task_metadata=self.task_metadata,
|
|
129
134
|
hf_subset=self.hf_subset,
|
|
@@ -14,6 +14,7 @@ from mteb._evaluators.evaluator import Evaluator
|
|
|
14
14
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
15
|
from mteb.models import EncoderProtocol
|
|
16
16
|
from mteb.similarity_functions import compute_pairwise_similarity
|
|
17
|
+
from mteb.types import PromptType
|
|
17
18
|
|
|
18
19
|
logger = logging.getLogger(__name__)
|
|
19
20
|
|
|
@@ -60,6 +61,8 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
60
61
|
task_metadata: TaskMetadata,
|
|
61
62
|
hf_split: str,
|
|
62
63
|
hf_subset: str,
|
|
64
|
+
input1_prompt_type: PromptType | None,
|
|
65
|
+
input2_prompt_type: PromptType | None,
|
|
63
66
|
**kwargs,
|
|
64
67
|
) -> None:
|
|
65
68
|
super().__init__(**kwargs)
|
|
@@ -69,6 +72,8 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
69
72
|
self.task_metadata = task_metadata
|
|
70
73
|
self.hf_split = hf_split
|
|
71
74
|
self.hf_subset = hf_subset
|
|
75
|
+
self.input1_prompt_type = input1_prompt_type
|
|
76
|
+
self.input2_prompt_type = input2_prompt_type
|
|
72
77
|
|
|
73
78
|
if len(self.dataset[self.input1_column_name]) != len(
|
|
74
79
|
self.dataset[self.input2_column_name]
|
|
@@ -82,47 +87,34 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
82
87
|
model: EncoderProtocol,
|
|
83
88
|
encode_kwargs: dict[str, Any],
|
|
84
89
|
) -> PairClassificationDistances:
|
|
85
|
-
logger.info("Running pair classification - Encoding
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
self.dataset[self.input1_column_name][:]
|
|
90
|
-
+ self.dataset[self.input2_column_name][:]
|
|
91
|
-
)
|
|
92
|
-
len_sentences1 = len(self.dataset[self.input1_column_name])
|
|
93
|
-
embeddings = self._encode_unique_texts(
|
|
94
|
-
all_sentences,
|
|
95
|
-
model,
|
|
96
|
-
task_metadata=self.task_metadata,
|
|
97
|
-
hf_split=self.hf_split,
|
|
98
|
-
hf_subset=self.hf_subset,
|
|
99
|
-
**encode_kwargs,
|
|
100
|
-
)
|
|
101
|
-
embeddings1 = embeddings[:len_sentences1]
|
|
102
|
-
embeddings2 = embeddings[len_sentences1:]
|
|
103
|
-
else:
|
|
104
|
-
embeddings1 = model.encode(
|
|
105
|
-
create_dataloader(
|
|
106
|
-
self.dataset,
|
|
107
|
-
task_metadata=self.task_metadata,
|
|
108
|
-
input_column=self.input1_column_name,
|
|
109
|
-
),
|
|
90
|
+
logger.info("Running pair classification - Encoding samples (1/2)")
|
|
91
|
+
embeddings1 = model.encode(
|
|
92
|
+
create_dataloader(
|
|
93
|
+
self.dataset,
|
|
110
94
|
task_metadata=self.task_metadata,
|
|
111
|
-
|
|
112
|
-
hf_subset=self.hf_subset,
|
|
95
|
+
input_column=self.input1_column_name,
|
|
113
96
|
**encode_kwargs,
|
|
114
|
-
)
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
97
|
+
),
|
|
98
|
+
task_metadata=self.task_metadata,
|
|
99
|
+
hf_split=self.hf_split,
|
|
100
|
+
hf_subset=self.hf_subset,
|
|
101
|
+
prompt_type=self.input1_prompt_type,
|
|
102
|
+
**encode_kwargs,
|
|
103
|
+
)
|
|
104
|
+
logger.info("Running pair classification - Encoding samples (2/2)")
|
|
105
|
+
embeddings2 = model.encode(
|
|
106
|
+
create_dataloader(
|
|
107
|
+
self.dataset,
|
|
121
108
|
task_metadata=self.task_metadata,
|
|
122
|
-
|
|
123
|
-
hf_subset=self.hf_subset,
|
|
109
|
+
input_column=self.input2_column_name,
|
|
124
110
|
**encode_kwargs,
|
|
125
|
-
)
|
|
111
|
+
),
|
|
112
|
+
task_metadata=self.task_metadata,
|
|
113
|
+
hf_split=self.hf_split,
|
|
114
|
+
hf_subset=self.hf_subset,
|
|
115
|
+
prompt_type=self.input2_prompt_type,
|
|
116
|
+
**encode_kwargs,
|
|
117
|
+
)
|
|
126
118
|
|
|
127
119
|
logger.info("Running pair classification - Evaluating pair similarity...")
|
|
128
120
|
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
|
|
@@ -168,7 +160,7 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
168
160
|
)
|
|
169
161
|
all_unique_texts_embs = np.asarray(
|
|
170
162
|
model.encode(
|
|
171
|
-
_create_dataloader_from_texts(all_unique_texts),
|
|
163
|
+
_create_dataloader_from_texts(all_unique_texts, **encode_kwargs),
|
|
172
164
|
task_metadata=task_metadata,
|
|
173
165
|
hf_split=hf_split,
|
|
174
166
|
hf_subset=hf_subset,
|
|
@@ -6,7 +6,7 @@ from datasets import Dataset
|
|
|
6
6
|
from torch.utils.data import DataLoader
|
|
7
7
|
from typing_extensions import Self
|
|
8
8
|
|
|
9
|
-
from mteb._create_dataloaders import
|
|
9
|
+
from mteb._create_dataloaders import create_dataloader
|
|
10
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
11
|
from mteb.models import EncoderProtocol
|
|
12
12
|
from mteb.types import BatchedInput
|
|
@@ -50,33 +50,20 @@ class SklearnEvaluator(Evaluator):
|
|
|
50
50
|
self.evaluator_model = evaluator_model
|
|
51
51
|
|
|
52
52
|
def create_dataloaders(
|
|
53
|
-
self,
|
|
53
|
+
self, encode_kwargs: dict[str, Any]
|
|
54
54
|
) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
if self.values_column_name != "text":
|
|
68
|
-
self.train_dataset = self.train_dataset.rename_column(
|
|
69
|
-
self.values_column_name, "text"
|
|
70
|
-
)
|
|
71
|
-
self.eval_dataset = self.eval_dataset.rename_column(
|
|
72
|
-
self.values_column_name, "text"
|
|
73
|
-
)
|
|
74
|
-
dataloader_train = DataLoader(self.train_dataset)
|
|
75
|
-
dataloader_test = DataLoader(self.eval_dataset)
|
|
76
|
-
else:
|
|
77
|
-
raise ValueError(
|
|
78
|
-
"ClassificationEvaluator only supports image and text modalities."
|
|
79
|
-
)
|
|
55
|
+
dataloader_train = create_dataloader(
|
|
56
|
+
self.train_dataset,
|
|
57
|
+
self.task_metadata,
|
|
58
|
+
input_column=self.values_column_name,
|
|
59
|
+
**encode_kwargs,
|
|
60
|
+
)
|
|
61
|
+
dataloader_test = create_dataloader(
|
|
62
|
+
self.eval_dataset,
|
|
63
|
+
self.task_metadata,
|
|
64
|
+
input_column=self.values_column_name,
|
|
65
|
+
**encode_kwargs,
|
|
66
|
+
)
|
|
80
67
|
return dataloader_train, dataloader_test
|
|
81
68
|
|
|
82
69
|
def __call__( # type: ignore[override]
|
|
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
98
85
|
|
|
99
86
|
"""
|
|
100
87
|
dataloader_train, dataloader_test = self.create_dataloaders(
|
|
101
|
-
|
|
88
|
+
encode_kwargs=encode_kwargs,
|
|
102
89
|
)
|
|
103
90
|
|
|
104
91
|
logger.info("Running - Encoding samples...")
|
|
@@ -46,7 +46,10 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
46
46
|
|
|
47
47
|
embeddings = {}
|
|
48
48
|
for sub in tqdm(subsets):
|
|
49
|
-
dataloader = _create_dataloader_from_texts(
|
|
49
|
+
dataloader = _create_dataloader_from_texts(
|
|
50
|
+
self.sentences[sub],
|
|
51
|
+
**encode_kwargs,
|
|
52
|
+
)
|
|
50
53
|
embeddings[sub] = model.encode(
|
|
51
54
|
dataloader,
|
|
52
55
|
task_metadata=self.task_metadata,
|
|
@@ -109,7 +109,8 @@ class SummarizationEvaluator(Evaluator):
|
|
|
109
109
|
summary
|
|
110
110
|
for human_summaries in self.human_summaries
|
|
111
111
|
for summary in human_summaries
|
|
112
|
-
]
|
|
112
|
+
],
|
|
113
|
+
**encode_kwargs,
|
|
113
114
|
),
|
|
114
115
|
task_metadata=self.task_metadata,
|
|
115
116
|
hf_subset=self.hf_subset,
|
|
@@ -124,7 +125,8 @@ class SummarizationEvaluator(Evaluator):
|
|
|
124
125
|
summary
|
|
125
126
|
for machine_summaries in self.machine_summaries
|
|
126
127
|
for summary in machine_summaries
|
|
127
|
-
]
|
|
128
|
+
],
|
|
129
|
+
**encode_kwargs,
|
|
128
130
|
),
|
|
129
131
|
task_metadata=self.task_metadata,
|
|
130
132
|
hf_subset=self.hf_subset,
|
|
@@ -42,14 +42,14 @@ class ZeroShotClassificationEvaluator(Evaluator):
|
|
|
42
42
|
) -> Array:
|
|
43
43
|
dataloader = create_dataloader(
|
|
44
44
|
self.dataset,
|
|
45
|
-
batch_size=encode_kwargs["batch_size"],
|
|
46
45
|
input_column=self.input_column_name,
|
|
47
46
|
task_metadata=self.task_metadata,
|
|
47
|
+
**encode_kwargs,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
50
|
logger.info("Running zero-shot classification - Encoding labels...")
|
|
51
51
|
text_label_embeddings = model.encode(
|
|
52
|
-
_create_dataloader_from_texts(self.candidate_labels),
|
|
52
|
+
_create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
|
|
53
53
|
task_metadata=self.task_metadata,
|
|
54
54
|
hf_subset=self.hf_subset,
|
|
55
55
|
hf_split=self.hf_split,
|
|
File without changes
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import datasets
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from datasets import Dataset, DatasetDict
|
|
8
|
+
|
|
9
|
+
from mteb import TaskMetadata
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def deduplicate(dataset: Dataset, input_column: str) -> Dataset:
|
|
15
|
+
"""Remove duplicate texts, keeping the first occurrence."""
|
|
16
|
+
unique_texts = set()
|
|
17
|
+
indices_to_keep = []
|
|
18
|
+
for i, text in enumerate(dataset[input_column]):
|
|
19
|
+
text = text.strip()
|
|
20
|
+
if text not in unique_texts:
|
|
21
|
+
unique_texts.add(text)
|
|
22
|
+
indices_to_keep.append(i)
|
|
23
|
+
|
|
24
|
+
logger.info(
|
|
25
|
+
f"[deduplicate] removed={len(dataset) - len(indices_to_keep)}/{len(dataset)}"
|
|
26
|
+
)
|
|
27
|
+
return dataset.select(indices_to_keep)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def filter_empty(dataset: Dataset, input_column: str) -> Dataset:
|
|
31
|
+
"""Filter out empty or whitespace-only examples."""
|
|
32
|
+
before = len(dataset)
|
|
33
|
+
ds = dataset.filter(lambda x: len(x[input_column].strip()) > 0)
|
|
34
|
+
logger.info(f"[filter_empty] removed={before - len(ds)}/{before}")
|
|
35
|
+
return ds
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def filter_train_leakage(
|
|
39
|
+
train_dataset: Dataset, test_dataset: Dataset, input_column: str
|
|
40
|
+
) -> Dataset:
|
|
41
|
+
"""Remove test examples that appear in training."""
|
|
42
|
+
train_texts = set(train_dataset[input_column])
|
|
43
|
+
before = len(test_dataset)
|
|
44
|
+
indices = [
|
|
45
|
+
i
|
|
46
|
+
for i, text in enumerate(test_dataset[input_column])
|
|
47
|
+
if text not in train_texts
|
|
48
|
+
]
|
|
49
|
+
logger.info(f"[filter_train_leakage] removed={before - len(indices)}/{before}")
|
|
50
|
+
return test_dataset.select(indices)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def filter_unclear_label(
|
|
54
|
+
dataset_dict: DatasetDict, input_column: str, label_column: str
|
|
55
|
+
) -> DatasetDict:
|
|
56
|
+
"""Remove examples where the same text appears with multiple different labels."""
|
|
57
|
+
normalized: dict[str, set[str | tuple[str, ...]]] = {}
|
|
58
|
+
logger.debug("[filter_controversial] scanning dataset for label conflicts...")
|
|
59
|
+
|
|
60
|
+
for split, ds in dataset_dict.items():
|
|
61
|
+
for text, label in zip(ds[input_column], ds[label_column]):
|
|
62
|
+
key = text.strip().lower()
|
|
63
|
+
normalized.setdefault(key, set()).add(
|
|
64
|
+
label if isinstance(label, (str, int, float)) else tuple(label)
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
|
|
68
|
+
logger.info(f"[filter_controversial] Removing {len(bad_texts)} conflicting texts")
|
|
69
|
+
|
|
70
|
+
new_dict = {}
|
|
71
|
+
for split, ds in dataset_dict.items():
|
|
72
|
+
before = len(ds)
|
|
73
|
+
filtered = ds.filter(lambda x: x[input_column].strip().lower() not in bad_texts)
|
|
74
|
+
logger.debug(
|
|
75
|
+
f"[filter_controversial:{split}] removed={before - len(filtered)}/{before}"
|
|
76
|
+
)
|
|
77
|
+
new_dict[split] = filtered
|
|
78
|
+
|
|
79
|
+
return DatasetDict(new_dict)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def filter_short(dataset: Dataset, input_column: str, min_words: int = 3) -> Dataset:
|
|
83
|
+
"""Filter out texts with fewer than `min_words`."""
|
|
84
|
+
before = len(dataset)
|
|
85
|
+
ds = dataset.filter(lambda x: len(x[input_column].strip().split()) >= min_words)
|
|
86
|
+
logger.debug(f"[filter_short] removed={before - len(ds)}/{before}")
|
|
87
|
+
return ds
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def split_train_test(
|
|
91
|
+
ds: DatasetDict,
|
|
92
|
+
metadata: TaskMetadata,
|
|
93
|
+
train_split: str,
|
|
94
|
+
label_column: str,
|
|
95
|
+
) -> DatasetDict:
|
|
96
|
+
if train_split in ds and metadata.eval_splits == train_split:
|
|
97
|
+
before = len(ds[train_split])
|
|
98
|
+
logger.info(
|
|
99
|
+
f"[split_train_test] eval_splits == train_split; performing split on {before} examples"
|
|
100
|
+
)
|
|
101
|
+
ds[train_split] = ds[train_split].cast_column(
|
|
102
|
+
label_column,
|
|
103
|
+
datasets.ClassLabel(names=list(set(ds[train_split][label_column]))),
|
|
104
|
+
)
|
|
105
|
+
label_counts = pd.Series(ds[train_split][label_column]).value_counts()
|
|
106
|
+
one_sample_labels = set(label_counts[label_counts == 1].index.tolist())
|
|
107
|
+
|
|
108
|
+
if one_sample_labels:
|
|
109
|
+
logger.info(
|
|
110
|
+
f"[split_train_test] Removing {len(one_sample_labels)} labels with only one instance"
|
|
111
|
+
)
|
|
112
|
+
ds[train_split] = ds[train_split].filter(
|
|
113
|
+
lambda x: x[label_column] not in one_sample_labels
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
splits = ds[train_split].train_test_split(
|
|
117
|
+
test_size=min(2048, before // 2), seed=42, stratify_by_column=label_column
|
|
118
|
+
)
|
|
119
|
+
ds = DatasetDict({train_split: splits[train_split], "test": splits["test"]})
|
|
120
|
+
metadata.eval_splits = ["test"]
|
|
121
|
+
logger.info(
|
|
122
|
+
f"[split_train_test] Train size={len(ds[train_split])}, Test size={len(ds['test'])}"
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
return ds
|