mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from datasets import DatasetDict
|
|
4
|
+
|
|
5
|
+
from mteb import TaskMetadata
|
|
6
|
+
from mteb.abstasks import AbsTaskClassification
|
|
7
|
+
from mteb.abstasks._data_filter.filters import (
|
|
8
|
+
deduplicate,
|
|
9
|
+
filter_empty,
|
|
10
|
+
filter_short,
|
|
11
|
+
filter_train_leakage,
|
|
12
|
+
filter_unclear_label,
|
|
13
|
+
split_train_test,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def clean_dataset(
    ds: DatasetDict,
    metadata: TaskMetadata,
    train_split: str,
    input_column: str,
    label_column: str,
    subset: str | None = None,
) -> DatasetDict:
    """Apply the full cleaning pipeline with logging.

    Pipeline order: empty-text filter, deduplication, (optionally) short-text
    filter, train/test re-split, train-leakage removal from eval splits, and
    finally unclear-label filtering across the whole dataset.

    Args:
        ds: The dataset dict with one `datasets.Dataset` per split.
        metadata: Task metadata; `eval_langs` and `eval_splits` are read.
        train_split: Name of the training split.
        input_column: Name of the text input column.
        label_column: Name of the label column.
        subset: Optional subset key used to look up per-subset languages in
            `metadata.eval_langs` when it is a dict.

    Returns:
        The cleaned dataset dict.
    """
    logger.info("[clean_dataset] Starting dataset cleaning pipeline...")

    transforms = [
        ("filter_empty", filter_empty),
        ("deduplicate", deduplicate),
    ]

    # Languages without whitespace-delimited words, where a character-count
    # based short-text filter would be misleading.
    skip_cjk_codes = {"zho", "jpn", "tha", "mya", "cmn"}
    cur_langs = (
        metadata.eval_langs[subset]
        if isinstance(metadata.eval_langs, dict) and subset
        else metadata.eval_langs
    )
    # Apply the short-text filter only when no language in this (sub)task is
    # in the skip set; lang codes may carry a script suffix ("zho-Hans").
    apply_short = not any(lang.split("-")[0] in skip_cjk_codes for lang in cur_langs)
    if apply_short:
        # BUGFIX: previously this message was also logged unconditionally
        # before `apply_short` was computed, claiming the filter was applied
        # even for CJK datasets where it is skipped. Log only when it runs.
        logger.info("[clean_dataset] Applying short-text filter")
        transforms.append(("filter_short", filter_short))

    for split in [train_split, *metadata.eval_splits]:
        if split not in ds:
            logger.warning(f"[clean_dataset] Split '{split}' missing; skipping.")
            continue

        for name, fn in transforms:
            before = len(ds[split])
            ds[split] = fn(ds[split], input_column=input_column)
            logger.info(
                f"[clean_dataset:{split}] {name} removed={before - len(ds[split])}"
            )

    # Re-derive the train/test partition after per-split filtering.
    ds = split_train_test(ds, metadata, train_split, label_column)

    # Remove eval examples whose inputs also occur in the training split.
    for split in metadata.eval_splits:
        if split == train_split:
            continue
        before = len(ds[split])
        ds[split] = filter_train_leakage(ds[train_split], ds[split], input_column)
        logger.info(
            f"[clean_dataset:{split}] leakage_removed={before - len(ds[split])}"
        )

    # Operates on the whole DatasetDict (cross-split label consistency).
    ds = filter_unclear_label(ds, input_column=input_column, label_column=label_column)

    logger.info("[clean_dataset] Cleaning pipeline complete.")
    return ds
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def process_classification(
    task: AbsTaskClassification,
) -> DatasetDict | dict[str, DatasetDict]:
    """Run the cleaning pipeline over a classification task's dataset(s).

    Loads the task data on demand. A plain ``DatasetDict`` is cleaned
    directly; a mapping of per-subset ``DatasetDict`` objects is cleaned
    subset by subset, preserving the mapping shape.
    """
    if not task.data_loaded:
        task.load_data()

    if isinstance(task.dataset, DatasetDict):
        # Single-subset task: clean the whole dataset in one pass.
        return clean_dataset(
            task.dataset,
            task.metadata,
            task.train_split,
            task.input_column_name,
            task.label_column_name,
            subset=None,
        )

    # Multi-subset task: clean each subset independently.
    return {
        subset: clean_dataset(
            task.dataset[subset],
            task.metadata,
            task.train_split,
            task.input_column_name,
            task.label_column_name,
            subset=subset,
        )
        for subset in task.dataset
    }
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
4
|
from collections import Counter
|
|
3
|
-
|
|
4
|
-
from PIL import Image
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
5
6
|
|
|
6
7
|
from mteb.types import TopRankedDocumentsType
|
|
7
8
|
from mteb.types.statistics import (
|
|
@@ -13,6 +14,9 @@ from mteb.types.statistics import (
|
|
|
13
14
|
TopRankedStatistics,
|
|
14
15
|
)
|
|
15
16
|
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
16
20
|
|
|
17
21
|
def calculate_text_statistics(texts: list[str]) -> TextStatistics:
|
|
18
22
|
"""Calculate descriptive statistics for a list of texts.
|
mteb/abstasks/classification.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Any, TypedDict
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
from datasets import Dataset, DatasetDict
|
|
8
|
-
from PIL import ImageFile
|
|
9
8
|
from sklearn.linear_model import LogisticRegression
|
|
10
9
|
from sklearn.metrics import (
|
|
11
10
|
accuracy_score,
|
|
@@ -32,7 +31,6 @@ from ._statistics_calculation import (
|
|
|
32
31
|
)
|
|
33
32
|
from .abstask import AbsTask
|
|
34
33
|
|
|
35
|
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
36
34
|
logger = logging.getLogger(__name__)
|
|
37
35
|
|
|
38
36
|
|
mteb/abstasks/clustering.py
CHANGED
|
@@ -89,6 +89,9 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
89
89
|
prediction_folder: Path | None = None,
|
|
90
90
|
**kwargs: Any,
|
|
91
91
|
) -> ScoresDict:
|
|
92
|
+
data_split = data_split.select_columns(
|
|
93
|
+
[self.input_column_name, self.label_column_name]
|
|
94
|
+
)
|
|
92
95
|
# MTEB text clustering requires renaming and eval per subset.
|
|
93
96
|
if self.metadata.modalities == ["text"]:
|
|
94
97
|
all_metrics = []
|
|
@@ -14,6 +14,7 @@ from sklearn.preprocessing import MultiLabelBinarizer
|
|
|
14
14
|
from typing_extensions import override
|
|
15
15
|
|
|
16
16
|
from mteb._create_dataloaders import create_dataloader
|
|
17
|
+
from mteb._evaluators.classification_metrics import hamming_score
|
|
17
18
|
from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
|
|
18
19
|
from mteb.models import EncoderProtocol
|
|
19
20
|
|
|
@@ -40,11 +41,13 @@ class MultilabelClassificationMetrics(TypedDict):
|
|
|
40
41
|
accuracy: Accuracy of the classifier.
|
|
41
42
|
lrap: Label Ranking Average Precision (LRAP) score.
|
|
42
43
|
f1: Macro F1 score.
|
|
44
|
+
hamming: Hamming score (label-based accuracy).
|
|
43
45
|
"""
|
|
44
46
|
|
|
45
47
|
accuracy: float
|
|
46
48
|
lrap: float
|
|
47
49
|
f1: float
|
|
50
|
+
hamming: float
|
|
48
51
|
|
|
49
52
|
|
|
50
53
|
class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
|
|
@@ -112,7 +115,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
112
115
|
unique_train_dataset,
|
|
113
116
|
self.metadata,
|
|
114
117
|
input_column=self.input_column_name,
|
|
115
|
-
|
|
118
|
+
**encode_kwargs,
|
|
116
119
|
)
|
|
117
120
|
|
|
118
121
|
logger.info("Running multilabel classification - Encoding training set...")
|
|
@@ -141,7 +144,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
141
144
|
test_dataset.select_columns(self.input_column_name),
|
|
142
145
|
self.metadata,
|
|
143
146
|
input_column=self.input_column_name,
|
|
144
|
-
|
|
147
|
+
**encode_kwargs,
|
|
145
148
|
)
|
|
146
149
|
|
|
147
150
|
logger.info("Running multilabel classification - Encoding test set...")
|
|
@@ -157,7 +160,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
157
160
|
|
|
158
161
|
logger.info("Running multilabel classification - Evaluating classifiers...")
|
|
159
162
|
all_predictions = []
|
|
160
|
-
for
|
|
163
|
+
for _, sample_indices in enumerate(train_samples):
|
|
161
164
|
X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
|
|
162
165
|
y_train = train_split.select(sample_indices)[self.label_column_name]
|
|
163
166
|
y_train = binarizer.transform(y_train)
|
|
@@ -207,10 +210,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
207
210
|
else:
|
|
208
211
|
lrap = label_ranking_average_precision_score(y_test, y_pred)
|
|
209
212
|
f1 = f1_score(y_test, y_pred, average="macro")
|
|
213
|
+
hamming = hamming_score(y_test, y_pred)
|
|
210
214
|
return MultilabelClassificationMetrics(
|
|
211
215
|
accuracy=accuracy,
|
|
212
216
|
lrap=lrap,
|
|
213
217
|
f1=f1,
|
|
218
|
+
hamming=hamming,
|
|
214
219
|
)
|
|
215
220
|
|
|
216
221
|
def _undersample_data_indices(
|
|
@@ -218,6 +223,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
218
223
|
) -> tuple[list[int], list[int]]:
|
|
219
224
|
"""Undersample data to have samples_per_label samples of each label.
|
|
220
225
|
|
|
226
|
+
Currently ensures that each label has at least samples_per_label samples.
|
|
227
|
+
|
|
221
228
|
Returns:
|
|
222
229
|
A tuple containing:
|
|
223
230
|
- List of sampled indices.
|
|
@@ -19,6 +19,7 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
19
19
|
from mteb.abstasks.abstask import AbsTask
|
|
20
20
|
from mteb.models.model_meta import ScoringFunction
|
|
21
21
|
from mteb.models.models_protocols import EncoderProtocol
|
|
22
|
+
from mteb.types import PromptType
|
|
22
23
|
from mteb.types.statistics import (
|
|
23
24
|
ImageStatistics,
|
|
24
25
|
LabelStatistics,
|
|
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
|
|
|
35
36
|
Attributes:
|
|
36
37
|
num_samples: number of samples in the dataset.
|
|
37
38
|
number_of_characters: Total number of symbols in the dataset.
|
|
38
|
-
|
|
39
|
+
unique_pairs: Number of unique pairs
|
|
39
40
|
|
|
40
41
|
text1_statistics: Statistics for sentence1
|
|
41
42
|
text2_statistics: Statistics for sentence2
|
|
@@ -65,12 +66,16 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
65
66
|
input2_column_name: The name of the column containing the second sentence in the pair.
|
|
66
67
|
label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
|
|
67
68
|
abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
|
|
69
|
+
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
|
|
70
|
+
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
|
|
68
71
|
"""
|
|
69
72
|
|
|
70
73
|
abstask_prompt = "Retrieve text that are semantically similar to the given text."
|
|
71
74
|
input1_column_name: str = "sentence1"
|
|
72
75
|
input2_column_name: str = "sentence2"
|
|
73
76
|
label_column_name: str = "labels"
|
|
77
|
+
input1_prompt_type: PromptType | None = None
|
|
78
|
+
input2_prompt_type: PromptType | None = None
|
|
74
79
|
|
|
75
80
|
def _evaluate_subset(
|
|
76
81
|
self,
|
|
@@ -93,6 +98,8 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
93
98
|
task_metadata=self.metadata,
|
|
94
99
|
hf_split=hf_split,
|
|
95
100
|
hf_subset=hf_subset,
|
|
101
|
+
input1_prompt_type=self.input1_prompt_type,
|
|
102
|
+
input2_prompt_type=self.input2_prompt_type,
|
|
96
103
|
**kwargs,
|
|
97
104
|
)
|
|
98
105
|
similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
|
mteb/abstasks/sts.py
CHANGED
|
@@ -8,6 +8,7 @@ from scipy.stats import pearsonr, spearmanr
|
|
|
8
8
|
from mteb._evaluators import AnySTSEvaluator
|
|
9
9
|
from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
|
|
10
10
|
from mteb.models import EncoderProtocol
|
|
11
|
+
from mteb.types import PromptType
|
|
11
12
|
from mteb.types.statistics import (
|
|
12
13
|
ImageStatistics,
|
|
13
14
|
ScoreStatistics,
|
|
@@ -89,12 +90,16 @@ class AbsTaskSTS(AbsTask):
|
|
|
89
90
|
min_score: Minimum possible score in the dataset.
|
|
90
91
|
max_score: Maximum possible score in the dataset.
|
|
91
92
|
abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
|
|
93
|
+
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
|
|
94
|
+
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
|
|
92
95
|
"""
|
|
93
96
|
|
|
94
97
|
abstask_prompt = "Retrieve semantically similar text."
|
|
95
98
|
column_names: tuple[str, str] = ("sentence1", "sentence2")
|
|
96
99
|
min_score: int = 0
|
|
97
100
|
max_score: int = 5
|
|
101
|
+
input1_prompt_type: PromptType | None = None
|
|
102
|
+
input2_prompt_type: PromptType | None = None
|
|
98
103
|
|
|
99
104
|
def _evaluate_subset(
|
|
100
105
|
self,
|
|
@@ -115,6 +120,8 @@ class AbsTaskSTS(AbsTask):
|
|
|
115
120
|
task_metadata=self.metadata,
|
|
116
121
|
hf_split=hf_split,
|
|
117
122
|
hf_subset=hf_subset,
|
|
123
|
+
input1_prompt_type=self.input1_prompt_type,
|
|
124
|
+
input2_prompt_type=self.input2_prompt_type,
|
|
118
125
|
**kwargs,
|
|
119
126
|
)
|
|
120
127
|
scores = evaluator(model, encode_kwargs=encode_kwargs)
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -107,6 +107,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
|
|
|
107
107
|
SampleCreationMethod = Literal[
|
|
108
108
|
"found",
|
|
109
109
|
"created",
|
|
110
|
+
"created and machine-translated",
|
|
110
111
|
"human-translated and localized",
|
|
111
112
|
"human-translated",
|
|
112
113
|
"machine-translated",
|
mteb/benchmarks/_create_table.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import math
|
|
2
1
|
import re
|
|
3
2
|
from collections import defaultdict
|
|
3
|
+
from typing import Literal
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -32,26 +32,18 @@ def _split_on_capital(s: str) -> str:
|
|
|
32
32
|
return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
def _format_n_parameters(n_parameters) ->
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
n_zeros = math.log10(n_thousand)
|
|
42
|
-
if n_zeros >= 6:
|
|
43
|
-
return str(n_thousand // (10**6)) + "B"
|
|
44
|
-
if n_zeros >= 3:
|
|
45
|
-
return str(n_thousand // (10**3)) + "M"
|
|
46
|
-
return str(n_thousand) + "K"
|
|
35
|
+
def _format_n_parameters(n_parameters) -> float | None:
|
|
36
|
+
"""Format n_parameters to be in billions with decimals down to 1 million. I.e. 7M -> 0.007B, 1.5B -> 1.5B, None -> None"""
|
|
37
|
+
if n_parameters:
|
|
38
|
+
n_parameters = float(n_parameters)
|
|
39
|
+
return round(n_parameters / 1e9, 3)
|
|
40
|
+
return None
|
|
47
41
|
|
|
48
42
|
|
|
49
|
-
def _format_max_tokens(max_tokens: float | None) ->
|
|
50
|
-
if max_tokens is None:
|
|
51
|
-
return
|
|
52
|
-
|
|
53
|
-
return "Infinite"
|
|
54
|
-
return str(int(max_tokens))
|
|
43
|
+
def _format_max_tokens(max_tokens: float | None) -> float | None:
|
|
44
|
+
if max_tokens is None or max_tokens == np.inf:
|
|
45
|
+
return None
|
|
46
|
+
return float(max_tokens)
|
|
55
47
|
|
|
56
48
|
|
|
57
49
|
def _get_means_per_types(per_task: pd.DataFrame):
|
|
@@ -144,18 +136,18 @@ def _create_summary_table_from_benchmark_results(
|
|
|
144
136
|
joint_table.insert(
|
|
145
137
|
1,
|
|
146
138
|
"Embedding Dimensions",
|
|
147
|
-
model_metas.map(lambda m:
|
|
139
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
148
140
|
)
|
|
149
141
|
joint_table.insert(
|
|
150
142
|
1,
|
|
151
|
-
"Number of Parameters",
|
|
143
|
+
"Number of Parameters (B)",
|
|
152
144
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
153
145
|
)
|
|
154
146
|
joint_table.insert(
|
|
155
147
|
1,
|
|
156
148
|
"Memory Usage (MB)",
|
|
157
149
|
model_metas.map(
|
|
158
|
-
lambda m:
|
|
150
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
159
151
|
),
|
|
160
152
|
)
|
|
161
153
|
|
|
@@ -250,6 +242,65 @@ def _create_per_task_table_from_benchmark_results(
|
|
|
250
242
|
return per_task
|
|
251
243
|
|
|
252
244
|
|
|
245
|
+
def _create_per_language_table_from_benchmark_results(
    benchmark_results: BenchmarkResults,
    language_view: list[str] | Literal["all"],
) -> pd.DataFrame:
    """Build a per-language score table from BenchmarkResults.

    Produces one row per model and one column per language, with models
    ordered by Borda rank across languages.

    Args:
        benchmark_results: BenchmarkResults object containing model results
        language_view: List of languages to include in the per-language table, or "all" for all languages present in the results

    Returns:
        DataFrame with per-language scores, ready for styling in the leaderboard
    """
    if language_view != "all" and not isinstance(language_view, list):
        raise ValueError("language_view must be a list of languages or 'all'")

    def _no_results() -> pd.DataFrame:
        # Single-cell placeholder shown when nothing matches the filters.
        return pd.DataFrame({"No results": ["You can try relaxing your criteria"]})

    scores = benchmark_results.to_dataframe(aggregation_level="language", format="long")
    if scores.empty:
        return _no_results()

    if language_view != "all":
        scores = scores[scores["language"].isin(language_view)]

    table = scores.pivot_table(
        index="model_name", columns="language", values="score", aggfunc="mean"
    )

    # Drop models that have no score for any of the selected languages.
    all_missing = table.isna().all(axis="columns")
    if all_missing.all():
        return _no_results()
    table = table.drop(list(table[all_missing].index), axis=0)

    # Rank models across languages, sort by the rank, then discard it.
    table["borda_rank"] = _get_borda_rank(table)
    table = (
        table.sort_values("borda_rank", ascending=True)
        .drop(columns=["borda_rank"])
        .reset_index()
    )

    # Show bare model names (strip any organisation prefix).
    table["model_name"] = table["model_name"].map(lambda name: name.split("/")[-1])
    return table.rename(
        columns={
            "model_name": "Model",
        }
    )
|
|
302
|
+
|
|
303
|
+
|
|
253
304
|
def _create_summary_table_mean_public_private(
|
|
254
305
|
benchmark_results: BenchmarkResults,
|
|
255
306
|
) -> pd.DataFrame:
|
|
@@ -323,18 +374,18 @@ def _create_summary_table_mean_public_private(
|
|
|
323
374
|
joint_table.insert(
|
|
324
375
|
1,
|
|
325
376
|
"Embedding Dimensions",
|
|
326
|
-
model_metas.map(lambda m:
|
|
377
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
327
378
|
)
|
|
328
379
|
joint_table.insert(
|
|
329
380
|
1,
|
|
330
|
-
"Number of Parameters",
|
|
381
|
+
"Number of Parameters (B)",
|
|
331
382
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
332
383
|
)
|
|
333
384
|
joint_table.insert(
|
|
334
385
|
1,
|
|
335
386
|
"Memory Usage (MB)",
|
|
336
387
|
model_metas.map(
|
|
337
|
-
lambda m:
|
|
388
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
338
389
|
),
|
|
339
390
|
)
|
|
340
391
|
|
|
@@ -358,9 +409,7 @@ def _create_summary_table_mean_public_private(
|
|
|
358
409
|
"mean(public)": "Mean (Public)",
|
|
359
410
|
"mean(private)": "Mean (Private)",
|
|
360
411
|
}
|
|
361
|
-
|
|
362
|
-
if "Retrieval" in joint_table.columns:
|
|
363
|
-
rename_dict["Retrieval"] = "Mean (Task)"
|
|
412
|
+
|
|
364
413
|
joint_table = joint_table.rename(columns=rename_dict)
|
|
365
414
|
|
|
366
415
|
# Move borda rank to front
|
|
@@ -447,18 +496,18 @@ def _create_summary_table_mean_subset(
|
|
|
447
496
|
joint_table.insert(
|
|
448
497
|
1,
|
|
449
498
|
"Embedding Dimensions",
|
|
450
|
-
model_metas.map(lambda m:
|
|
499
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
451
500
|
)
|
|
452
501
|
joint_table.insert(
|
|
453
502
|
1,
|
|
454
|
-
"Number of Parameters",
|
|
503
|
+
"Number of Parameters (B)",
|
|
455
504
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
456
505
|
)
|
|
457
506
|
joint_table.insert(
|
|
458
507
|
1,
|
|
459
508
|
"Memory Usage (MB)",
|
|
460
509
|
model_metas.map(
|
|
461
|
-
lambda m:
|
|
510
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
462
511
|
),
|
|
463
512
|
)
|
|
464
513
|
|
|
@@ -560,25 +609,23 @@ def _create_summary_table_mean_task_type(
|
|
|
560
609
|
|
|
561
610
|
# Insert model metadata columns
|
|
562
611
|
joint_table.insert(
|
|
563
|
-
1,
|
|
564
|
-
"Max Tokens",
|
|
565
|
-
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
|
|
612
|
+
1, "Max Tokens", model_metas.map(lambda m: _format_max_tokens(m.max_tokens))
|
|
566
613
|
)
|
|
567
614
|
joint_table.insert(
|
|
568
615
|
1,
|
|
569
616
|
"Embedding Dimensions",
|
|
570
|
-
model_metas.map(lambda m:
|
|
617
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
571
618
|
)
|
|
572
619
|
joint_table.insert(
|
|
573
620
|
1,
|
|
574
|
-
"Number of Parameters",
|
|
621
|
+
"Number of Parameters (B)",
|
|
575
622
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
576
623
|
)
|
|
577
624
|
joint_table.insert(
|
|
578
625
|
1,
|
|
579
626
|
"Memory Usage (MB)",
|
|
580
627
|
model_metas.map(
|
|
581
|
-
lambda m:
|
|
628
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
582
629
|
),
|
|
583
630
|
)
|
|
584
631
|
|
mteb/benchmarks/benchmark.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from collections.abc import Iterable, Sequence
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
6
|
|
|
5
7
|
import pandas as pd
|
|
6
8
|
|
|
7
|
-
from mteb.
|
|
8
|
-
_create_per_task_table_from_benchmark_results,
|
|
9
|
-
_create_summary_table_from_benchmark_results,
|
|
10
|
-
_create_summary_table_mean_public_private,
|
|
11
|
-
_create_summary_table_mean_subset,
|
|
12
|
-
_create_summary_table_mean_task_type,
|
|
13
|
-
)
|
|
14
|
-
from mteb.results import BenchmarkResults
|
|
9
|
+
from mteb.abstasks.abstask import AbsTask
|
|
15
10
|
from mteb.types import StrURL
|
|
16
11
|
|
|
17
12
|
if TYPE_CHECKING:
|
|
18
|
-
from mteb.
|
|
13
|
+
from mteb.results import BenchmarkResults
|
|
19
14
|
|
|
20
15
|
|
|
21
16
|
@dataclass
|
|
@@ -42,7 +37,7 @@ class Benchmark:
|
|
|
42
37
|
"""
|
|
43
38
|
|
|
44
39
|
name: str
|
|
45
|
-
tasks: Sequence[
|
|
40
|
+
tasks: Sequence[AbsTask]
|
|
46
41
|
description: str | None = None
|
|
47
42
|
reference: StrURL | None = None
|
|
48
43
|
citation: str | None = None
|
|
@@ -50,14 +45,15 @@ class Benchmark:
|
|
|
50
45
|
display_on_leaderboard: bool = True
|
|
51
46
|
icon: str | None = None
|
|
52
47
|
display_name: str | None = None
|
|
48
|
+
language_view: list[str] | Literal["all"] = field(default_factory=list)
|
|
53
49
|
|
|
54
|
-
def __iter__(self) -> Iterable[
|
|
50
|
+
def __iter__(self) -> Iterable[AbsTask]:
|
|
55
51
|
return iter(self.tasks)
|
|
56
52
|
|
|
57
53
|
def __len__(self) -> int:
|
|
58
54
|
return len(self.tasks)
|
|
59
55
|
|
|
60
|
-
def __getitem__(self, index: int) ->
|
|
56
|
+
def __getitem__(self, index: int) -> AbsTask:
|
|
61
57
|
return self.tasks[index]
|
|
62
58
|
|
|
63
59
|
def _create_summary_table(
|
|
@@ -68,6 +64,10 @@ class Benchmark:
|
|
|
68
64
|
Returns:
|
|
69
65
|
A pandas DataFrame representing the summary results.
|
|
70
66
|
"""
|
|
67
|
+
from mteb.benchmarks._create_table import (
|
|
68
|
+
_create_summary_table_from_benchmark_results,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
71
|
return _create_summary_table_from_benchmark_results(benchmark_results)
|
|
72
72
|
|
|
73
73
|
def _create_per_task_table(
|
|
@@ -78,8 +78,38 @@ class Benchmark:
|
|
|
78
78
|
Returns:
|
|
79
79
|
A pandas DataFrame representing the per-task results.
|
|
80
80
|
"""
|
|
81
|
+
from mteb.benchmarks._create_table import (
|
|
82
|
+
_create_per_task_table_from_benchmark_results,
|
|
83
|
+
)
|
|
84
|
+
|
|
81
85
|
return _create_per_task_table_from_benchmark_results(benchmark_results)
|
|
82
86
|
|
|
87
|
+
def _create_per_language_table(
    self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
    """Create per-language table. Called by the leaderboard app.

    Returns:
        A pandas DataFrame representing the per-language results.
    """
    from mteb.benchmarks._create_table import (
        _create_per_language_table_from_benchmark_results,
    )

    # A benchmark opts into the per-language view via `language_view`
    # ("all" or a non-empty list); otherwise show a placeholder frame.
    if self.language_view != "all" and not self.language_view:
        return pd.DataFrame(
            {
                "No results": [
                    "The per-language table is not available for this benchmark."
                ]
            }
        )
    return _create_per_language_table_from_benchmark_results(
        benchmark_results, self.language_view
    )
|
|
112
|
+
|
|
83
113
|
|
|
84
114
|
class RtebBenchmark(Benchmark):
|
|
85
115
|
"""Wrapper for RTEB benchmark."""
|
|
@@ -87,7 +117,14 @@ class RtebBenchmark(Benchmark):
|
|
|
87
117
|
def _create_summary_table(
|
|
88
118
|
self, benchmark_results: BenchmarkResults
|
|
89
119
|
) -> pd.DataFrame:
|
|
90
|
-
|
|
120
|
+
from mteb.benchmarks._create_table import (
|
|
121
|
+
_create_summary_table_mean_public_private,
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
joint_table = _create_summary_table_mean_public_private(benchmark_results)
|
|
125
|
+
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
|
|
126
|
+
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
|
|
127
|
+
return joint_table
|
|
91
128
|
|
|
92
129
|
|
|
93
130
|
class HUMEBenchmark(Benchmark):
|
|
@@ -96,6 +133,8 @@ class HUMEBenchmark(Benchmark):
|
|
|
96
133
|
def _create_summary_table(
|
|
97
134
|
self, benchmark_results: BenchmarkResults
|
|
98
135
|
) -> pd.DataFrame:
|
|
136
|
+
from mteb.benchmarks._create_table import _create_summary_table_mean_subset
|
|
137
|
+
|
|
99
138
|
return _create_summary_table_mean_subset(benchmark_results)
|
|
100
139
|
|
|
101
140
|
|
|
@@ -105,4 +144,24 @@ class MIEBBenchmark(Benchmark):
|
|
|
105
144
|
def _create_summary_table(
|
|
106
145
|
self, benchmark_results: BenchmarkResults
|
|
107
146
|
) -> pd.DataFrame:
|
|
147
|
+
from mteb.benchmarks._create_table import _create_summary_table_mean_task_type
|
|
148
|
+
|
|
108
149
|
return _create_summary_table_mean_task_type(benchmark_results)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class VidoreBenchmark(Benchmark):
    """Wrapper for Vidore3 benchmark."""

    def _create_summary_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        # Imported lazily to avoid a circular import at module load time.
        from mteb.benchmarks._create_table import (
            _create_summary_table_mean_public_private,
        )

        summary = _create_summary_table_mean_public_private(benchmark_results)
        # For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
        return summary.rename(columns={"Document Understanding": "Mean (Task)"})
|