mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +8 -3
- mteb/_evaluators/any_sts_evaluator.py +14 -12
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +0 -9
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +2 -1
- mteb/benchmarks/_create_table.py +1 -3
- mteb/benchmarks/benchmark.py +18 -1
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +125 -16
- mteb/benchmarks/get_benchmark.py +3 -1
- mteb/cache.py +7 -3
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/evaluate.py +26 -6
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/app.py +5 -3
- mteb/leaderboard/benchmark_selector.py +4 -2
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/instruct_wrapper.py +3 -0
- mteb/models/model_implementations/align_models.py +6 -0
- mteb/models/model_implementations/andersborges.py +51 -0
- mteb/models/model_implementations/ara_models.py +7 -0
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/blip2_models.py +9 -0
- mteb/models/model_implementations/blip_models.py +19 -0
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/cadet_models.py +8 -0
- mteb/models/model_implementations/cde_models.py +12 -0
- mteb/models/model_implementations/codefuse_models.py +15 -0
- mteb/models/model_implementations/codesage_models.py +12 -0
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/colqwen_models.py +57 -0
- mteb/models/model_implementations/emillykkejensen_models.py +70 -0
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jasper_models.py +253 -2
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/kalm_models.py +159 -25
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +8 -2
- mteb/models/model_implementations/moco_models.py +9 -0
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/openclip_models.py +16 -0
- mteb/models/model_implementations/piccolo_models.py +6 -0
- mteb/models/model_implementations/rasgaard_models.py +33 -0
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
- mteb/models/model_implementations/tarka_models.py +374 -0
- mteb/models/model_implementations/voyage_models.py +6 -7
- mteb/models/model_implementations/voyage_v.py +10 -9
- mteb/models/model_implementations/yuan_models.py +33 -0
- mteb/models/search_wrappers.py +6 -5
- mteb/results/task_result.py +19 -17
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +2 -3
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
- mteb/tasks/classification/nld/iconclass_classification.py +44 -0
- mteb/tasks/classification/nld/open_tender_classification.py +41 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +18 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb-2.1.19.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
mteb/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from importlib.metadata import version
|
|
2
2
|
|
|
3
|
+
from mteb import types
|
|
3
4
|
from mteb.abstasks import AbsTask
|
|
4
5
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
6
|
from mteb.deprecated_evaluator import MTEB
|
|
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
|
|
|
7
8
|
from mteb.filter_tasks import filter_tasks
|
|
8
9
|
from mteb.get_tasks import get_task, get_tasks
|
|
9
10
|
from mteb.load_results import load_results
|
|
10
|
-
from mteb.models import
|
|
11
|
+
from mteb.models import (
|
|
12
|
+
CrossEncoderProtocol,
|
|
13
|
+
EncoderProtocol,
|
|
14
|
+
SearchProtocol,
|
|
15
|
+
SentenceTransformerEncoderWrapper,
|
|
16
|
+
)
|
|
11
17
|
from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
|
|
12
18
|
from mteb.results import BenchmarkResults, TaskResult
|
|
13
19
|
|
|
@@ -21,7 +27,9 @@ __all__ = [
|
|
|
21
27
|
"AbsTask",
|
|
22
28
|
"Benchmark",
|
|
23
29
|
"BenchmarkResults",
|
|
30
|
+
"CrossEncoderProtocol",
|
|
24
31
|
"EncoderProtocol",
|
|
32
|
+
"SearchProtocol",
|
|
25
33
|
"SentenceTransformerEncoderWrapper",
|
|
26
34
|
"TaskMetadata",
|
|
27
35
|
"TaskResult",
|
|
@@ -35,4 +43,5 @@ __all__ = [
|
|
|
35
43
|
"get_task",
|
|
36
44
|
"get_tasks",
|
|
37
45
|
"load_results",
|
|
46
|
+
"types",
|
|
38
47
|
]
|
mteb/_create_dataloaders.py
CHANGED
|
@@ -3,7 +3,7 @@ from collections.abc import Callable
|
|
|
3
3
|
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
import torch
|
|
6
|
-
from datasets import Dataset
|
|
6
|
+
from datasets import Dataset, Image
|
|
7
7
|
from torch.utils.data import DataLoader, default_collate
|
|
8
8
|
|
|
9
9
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
@@ -22,12 +22,14 @@ logger = logging.getLogger(__name__)
|
|
|
22
22
|
def _create_dataloader_from_texts(
|
|
23
23
|
text: list[str],
|
|
24
24
|
batch_size: int = 32,
|
|
25
|
+
**kwargs: dict[str, Any],
|
|
25
26
|
) -> DataLoader[TextInput]:
|
|
26
27
|
"""Create a dataloader from a list of text.
|
|
27
28
|
|
|
28
29
|
Args:
|
|
29
30
|
text: A list of text to create a dataloader from.
|
|
30
31
|
batch_size: Batch size for the dataloader.
|
|
32
|
+
kwargs: Not used, present catching extra arguments.
|
|
31
33
|
|
|
32
34
|
Returns:
|
|
33
35
|
A dataloader with the text.
|
|
@@ -244,14 +246,15 @@ def _prepare_image_dataset(
|
|
|
244
246
|
transform: Callable[[Any], Any] | None = None,
|
|
245
247
|
) -> Dataset:
|
|
246
248
|
"""Prepare the image dataset by converting images to RGB and applying transformations."""
|
|
247
|
-
# If the dataset uses a different column name for images, rename it to "image".
|
|
248
249
|
if (
|
|
249
250
|
image_column_name
|
|
250
251
|
and image_column_name in dataset.column_names
|
|
251
252
|
and "image" not in dataset.column_names
|
|
252
253
|
):
|
|
253
254
|
dataset = dataset.rename_column(image_column_name, "image")
|
|
254
|
-
#
|
|
255
|
+
# don't process image if it's already in the correct format
|
|
256
|
+
if isinstance(dataset.features["image"], Image):
|
|
257
|
+
return dataset
|
|
255
258
|
return dataset.map(
|
|
256
259
|
_convert_images_to_rgb,
|
|
257
260
|
fn_kwargs={"image_col_name": "image", "transform": transform},
|
|
@@ -277,6 +280,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
|
|
|
277
280
|
# Leave the images as a list to avoid stacking errors.
|
|
278
281
|
collated[key] = [item[key] for item in batch]
|
|
279
282
|
else:
|
|
283
|
+
if any(item[key] is None for item in batch):
|
|
284
|
+
raise ValueError(f"Found None in batch for key '{key}'")
|
|
280
285
|
collated[key] = default_collate([item[key] for item in batch])
|
|
281
286
|
return collated
|
|
282
287
|
|
|
@@ -45,16 +45,8 @@ class AnySTSEvaluator(Evaluator):
|
|
|
45
45
|
**kwargs,
|
|
46
46
|
) -> None:
|
|
47
47
|
super().__init__(**kwargs)
|
|
48
|
-
self.
|
|
49
|
-
|
|
50
|
-
task_metadata,
|
|
51
|
-
input_column=sentences_column_names[0],
|
|
52
|
-
)
|
|
53
|
-
self.second_column = create_dataloader(
|
|
54
|
-
dataset,
|
|
55
|
-
task_metadata,
|
|
56
|
-
input_column=sentences_column_names[1],
|
|
57
|
-
)
|
|
48
|
+
self.dataset = dataset
|
|
49
|
+
self.input_columns = sentences_column_names
|
|
58
50
|
self.task_metadata = task_metadata
|
|
59
51
|
self.hf_split = hf_split
|
|
60
52
|
self.hf_subset = hf_subset
|
|
@@ -67,7 +59,12 @@ class AnySTSEvaluator(Evaluator):
|
|
|
67
59
|
) -> STSEvaluatorScores:
|
|
68
60
|
logger.info("Running semantic similarity - Encoding samples (1/2)")
|
|
69
61
|
embeddings1 = model.encode(
|
|
70
|
-
|
|
62
|
+
create_dataloader(
|
|
63
|
+
self.dataset,
|
|
64
|
+
self.task_metadata,
|
|
65
|
+
input_column=self.input_columns[0],
|
|
66
|
+
**encode_kwargs,
|
|
67
|
+
),
|
|
71
68
|
task_metadata=self.task_metadata,
|
|
72
69
|
hf_split=self.hf_split,
|
|
73
70
|
hf_subset=self.hf_subset,
|
|
@@ -76,7 +73,12 @@ class AnySTSEvaluator(Evaluator):
|
|
|
76
73
|
|
|
77
74
|
logger.info("Running semantic similarity - Encoding samples (2/2)...")
|
|
78
75
|
embeddings2 = model.encode(
|
|
79
|
-
|
|
76
|
+
create_dataloader(
|
|
77
|
+
self.dataset,
|
|
78
|
+
self.task_metadata,
|
|
79
|
+
input_column=self.input_columns[1],
|
|
80
|
+
**encode_kwargs,
|
|
81
|
+
),
|
|
80
82
|
task_metadata=self.task_metadata,
|
|
81
83
|
hf_split=self.hf_split,
|
|
82
84
|
hf_subset=self.hf_subset,
|
|
@@ -103,7 +103,7 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
103
103
|
text_embeddings = model.encode(
|
|
104
104
|
DataLoader(
|
|
105
105
|
Dataset.from_dict({"text": texts}),
|
|
106
|
-
|
|
106
|
+
**encode_kwargs,
|
|
107
107
|
),
|
|
108
108
|
task_metadata=self.task_metadata,
|
|
109
109
|
hf_subset=self.hf_subset,
|
|
@@ -122,8 +122,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
|
|
|
122
122
|
image_embeddings = model.encode(
|
|
123
123
|
DataLoader(
|
|
124
124
|
CustomImageDataset(images),
|
|
125
|
-
batch_size=encode_kwargs["batch_size"],
|
|
126
125
|
collate_fn=lambda x: {"image": [item["image"] for item in x]},
|
|
126
|
+
**encode_kwargs,
|
|
127
127
|
),
|
|
128
128
|
task_metadata=self.task_metadata,
|
|
129
129
|
hf_subset=self.hf_subset,
|
|
@@ -106,6 +106,7 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
106
106
|
self.dataset,
|
|
107
107
|
task_metadata=self.task_metadata,
|
|
108
108
|
input_column=self.input1_column_name,
|
|
109
|
+
**encode_kwargs,
|
|
109
110
|
),
|
|
110
111
|
task_metadata=self.task_metadata,
|
|
111
112
|
hf_split=self.hf_split,
|
|
@@ -117,6 +118,7 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
117
118
|
self.dataset,
|
|
118
119
|
task_metadata=self.task_metadata,
|
|
119
120
|
input_column=self.input2_column_name,
|
|
121
|
+
**encode_kwargs,
|
|
120
122
|
),
|
|
121
123
|
task_metadata=self.task_metadata,
|
|
122
124
|
hf_split=self.hf_split,
|
|
@@ -168,7 +170,7 @@ class PairClassificationEvaluator(Evaluator):
|
|
|
168
170
|
)
|
|
169
171
|
all_unique_texts_embs = np.asarray(
|
|
170
172
|
model.encode(
|
|
171
|
-
_create_dataloader_from_texts(all_unique_texts),
|
|
173
|
+
_create_dataloader_from_texts(all_unique_texts, **encode_kwargs),
|
|
172
174
|
task_metadata=task_metadata,
|
|
173
175
|
hf_split=hf_split,
|
|
174
176
|
hf_subset=hf_subset,
|
|
@@ -5,7 +5,6 @@ from typing import Any
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import pytrec_eval
|
|
8
|
-
import torch
|
|
9
8
|
from packaging.version import Version
|
|
10
9
|
from sklearn.metrics import auc
|
|
11
10
|
|
|
@@ -14,14 +13,6 @@ from mteb.types import RelevantDocumentsType, RetrievalEvaluationResult
|
|
|
14
13
|
logger = logging.getLogger(__name__)
|
|
15
14
|
|
|
16
15
|
|
|
17
|
-
try:
|
|
18
|
-
# speeds up computation if available
|
|
19
|
-
torch.set_float32_matmul_precision("high")
|
|
20
|
-
logger.info("Setting torch float32 matmul precision to high for a speedup")
|
|
21
|
-
except Exception:
|
|
22
|
-
pass
|
|
23
|
-
|
|
24
|
-
|
|
25
16
|
def mrr(
|
|
26
17
|
qrels: RelevantDocumentsType,
|
|
27
18
|
results: dict[str, dict[str, float]],
|
|
@@ -6,7 +6,7 @@ from datasets import Dataset
|
|
|
6
6
|
from torch.utils.data import DataLoader
|
|
7
7
|
from typing_extensions import Self
|
|
8
8
|
|
|
9
|
-
from mteb._create_dataloaders import
|
|
9
|
+
from mteb._create_dataloaders import create_dataloader
|
|
10
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
11
|
from mteb.models import EncoderProtocol
|
|
12
12
|
from mteb.types import BatchedInput
|
|
@@ -50,33 +50,20 @@ class SklearnEvaluator(Evaluator):
|
|
|
50
50
|
self.evaluator_model = evaluator_model
|
|
51
51
|
|
|
52
52
|
def create_dataloaders(
|
|
53
|
-
self,
|
|
53
|
+
self, encode_kwargs: dict[str, Any]
|
|
54
54
|
) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
if self.values_column_name != "text":
|
|
68
|
-
self.train_dataset = self.train_dataset.rename_column(
|
|
69
|
-
self.values_column_name, "text"
|
|
70
|
-
)
|
|
71
|
-
self.eval_dataset = self.eval_dataset.rename_column(
|
|
72
|
-
self.values_column_name, "text"
|
|
73
|
-
)
|
|
74
|
-
dataloader_train = DataLoader(self.train_dataset)
|
|
75
|
-
dataloader_test = DataLoader(self.eval_dataset)
|
|
76
|
-
else:
|
|
77
|
-
raise ValueError(
|
|
78
|
-
"ClassificationEvaluator only supports image and text modalities."
|
|
79
|
-
)
|
|
55
|
+
dataloader_train = create_dataloader(
|
|
56
|
+
self.train_dataset,
|
|
57
|
+
self.task_metadata,
|
|
58
|
+
input_column=self.values_column_name,
|
|
59
|
+
**encode_kwargs,
|
|
60
|
+
)
|
|
61
|
+
dataloader_test = create_dataloader(
|
|
62
|
+
self.eval_dataset,
|
|
63
|
+
self.task_metadata,
|
|
64
|
+
input_column=self.values_column_name,
|
|
65
|
+
**encode_kwargs,
|
|
66
|
+
)
|
|
80
67
|
return dataloader_train, dataloader_test
|
|
81
68
|
|
|
82
69
|
def __call__( # type: ignore[override]
|
|
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
|
|
|
98
85
|
|
|
99
86
|
"""
|
|
100
87
|
dataloader_train, dataloader_test = self.create_dataloaders(
|
|
101
|
-
|
|
88
|
+
encode_kwargs=encode_kwargs,
|
|
102
89
|
)
|
|
103
90
|
|
|
104
91
|
logger.info("Running - Encoding samples...")
|
|
@@ -46,7 +46,10 @@ class BitextMiningEvaluator(Evaluator):
|
|
|
46
46
|
|
|
47
47
|
embeddings = {}
|
|
48
48
|
for sub in tqdm(subsets):
|
|
49
|
-
dataloader = _create_dataloader_from_texts(
|
|
49
|
+
dataloader = _create_dataloader_from_texts(
|
|
50
|
+
self.sentences[sub],
|
|
51
|
+
**encode_kwargs,
|
|
52
|
+
)
|
|
50
53
|
embeddings[sub] = model.encode(
|
|
51
54
|
dataloader,
|
|
52
55
|
task_metadata=self.task_metadata,
|
|
@@ -109,7 +109,8 @@ class SummarizationEvaluator(Evaluator):
|
|
|
109
109
|
summary
|
|
110
110
|
for human_summaries in self.human_summaries
|
|
111
111
|
for summary in human_summaries
|
|
112
|
-
]
|
|
112
|
+
],
|
|
113
|
+
**encode_kwargs,
|
|
113
114
|
),
|
|
114
115
|
task_metadata=self.task_metadata,
|
|
115
116
|
hf_subset=self.hf_subset,
|
|
@@ -124,7 +125,8 @@ class SummarizationEvaluator(Evaluator):
|
|
|
124
125
|
summary
|
|
125
126
|
for machine_summaries in self.machine_summaries
|
|
126
127
|
for summary in machine_summaries
|
|
127
|
-
]
|
|
128
|
+
],
|
|
129
|
+
**encode_kwargs,
|
|
128
130
|
),
|
|
129
131
|
task_metadata=self.task_metadata,
|
|
130
132
|
hf_subset=self.hf_subset,
|
|
@@ -42,14 +42,14 @@ class ZeroShotClassificationEvaluator(Evaluator):
|
|
|
42
42
|
) -> Array:
|
|
43
43
|
dataloader = create_dataloader(
|
|
44
44
|
self.dataset,
|
|
45
|
-
batch_size=encode_kwargs["batch_size"],
|
|
46
45
|
input_column=self.input_column_name,
|
|
47
46
|
task_metadata=self.task_metadata,
|
|
47
|
+
**encode_kwargs,
|
|
48
48
|
)
|
|
49
49
|
|
|
50
50
|
logger.info("Running zero-shot classification - Encoding labels...")
|
|
51
51
|
text_label_embeddings = model.encode(
|
|
52
|
-
_create_dataloader_from_texts(self.candidate_labels),
|
|
52
|
+
_create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
|
|
53
53
|
task_metadata=self.task_metadata,
|
|
54
54
|
hf_subset=self.hf_subset,
|
|
55
55
|
hf_split=self.hf_split,
|
mteb/abstasks/_stratification.py
CHANGED
|
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
|
|
|
134
134
|
class IterativeStratification(_BaseKFold):
|
|
135
135
|
"""Iteratively stratify a multi-label data set into folds
|
|
136
136
|
|
|
137
|
-
Construct an
|
|
137
|
+
Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
|
|
138
138
|
with respect to order-th label combinations.
|
|
139
139
|
"""
|
|
140
140
|
|
mteb/abstasks/abstask.py
CHANGED
|
@@ -459,7 +459,7 @@ class AbsTask(ABC):
|
|
|
459
459
|
"""Filter the languages of the task.
|
|
460
460
|
|
|
461
461
|
Args:
|
|
462
|
-
languages: list of languages to filter the task by can be either a 3-letter
|
|
462
|
+
languages: list of languages to filter the task by can be either a 3-letter language code (e.g. "eng") or also include the script
|
|
463
463
|
(e.g. "eng-Latn")
|
|
464
464
|
script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
|
|
465
465
|
If the language code does not specify the script the intersection of the language and script will be used.
|
|
@@ -491,6 +491,11 @@ class AbsTask(ABC):
|
|
|
491
491
|
if lang_scripts.contains_languages(langs):
|
|
492
492
|
subsets_to_keep.append(hf_subset)
|
|
493
493
|
|
|
494
|
+
if len(subsets_to_keep) == 0:
|
|
495
|
+
raise ValueError(
|
|
496
|
+
f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
|
|
497
|
+
)
|
|
498
|
+
|
|
494
499
|
self.hf_subsets = subsets_to_keep
|
|
495
500
|
return self
|
|
496
501
|
|
mteb/abstasks/clustering.py
CHANGED
|
@@ -112,7 +112,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
112
112
|
unique_train_dataset,
|
|
113
113
|
self.metadata,
|
|
114
114
|
input_column=self.input_column_name,
|
|
115
|
-
|
|
115
|
+
**encode_kwargs,
|
|
116
116
|
)
|
|
117
117
|
|
|
118
118
|
logger.info("Running multilabel classification - Encoding training set...")
|
|
@@ -141,7 +141,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
141
141
|
test_dataset.select_columns(self.input_column_name),
|
|
142
142
|
self.metadata,
|
|
143
143
|
input_column=self.input_column_name,
|
|
144
|
-
|
|
144
|
+
**encode_kwargs,
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
logger.info("Running multilabel classification - Encoding test set...")
|
mteb/abstasks/retrieval.py
CHANGED
|
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
653
653
|
FileNotFoundError: If the specified path does not exist.
|
|
654
654
|
ValueError: If the loaded top ranked results are not in the expected format.
|
|
655
655
|
"""
|
|
656
|
+
self._top_k = top_k
|
|
657
|
+
|
|
656
658
|
top_ranked_path = Path(top_ranked_path)
|
|
657
659
|
if top_ranked_path.is_dir():
|
|
658
660
|
top_ranked_path = self._predictions_path(top_ranked_path)
|
|
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
682
684
|
top_k_sorted[query_id] = sorted_keys[: self._top_k]
|
|
683
685
|
|
|
684
686
|
self.dataset[subset][split]["top_ranked"] = top_k_sorted
|
|
685
|
-
self._top_k = top_k
|
|
686
687
|
return self
|
|
687
688
|
|
|
688
689
|
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -107,6 +107,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
|
|
|
107
107
|
SampleCreationMethod = Literal[
|
|
108
108
|
"found",
|
|
109
109
|
"created",
|
|
110
|
+
"created and machine-translated",
|
|
110
111
|
"human-translated and localized",
|
|
111
112
|
"human-translated",
|
|
112
113
|
"machine-translated",
|
|
@@ -532,7 +533,7 @@ class TaskMetadata(BaseModel):
|
|
|
532
533
|
citation=self.bibtex_citation,
|
|
533
534
|
dataset_description=self.description,
|
|
534
535
|
dataset_reference=self.reference,
|
|
535
|
-
|
|
536
|
+
descriptive_stats=descriptive_stats,
|
|
536
537
|
dataset_task_name=self.name,
|
|
537
538
|
category=self.category,
|
|
538
539
|
domains=", ".join(self.domains) if self.domains else None,
|
mteb/benchmarks/_create_table.py
CHANGED
|
@@ -358,9 +358,7 @@ def _create_summary_table_mean_public_private(
|
|
|
358
358
|
"mean(public)": "Mean (Public)",
|
|
359
359
|
"mean(private)": "Mean (Private)",
|
|
360
360
|
}
|
|
361
|
-
|
|
362
|
-
if "Retrieval" in joint_table.columns:
|
|
363
|
-
rename_dict["Retrieval"] = "Mean (Task)"
|
|
361
|
+
|
|
364
362
|
joint_table = joint_table.rename(columns=rename_dict)
|
|
365
363
|
|
|
366
364
|
# Move borda rank to front
|
mteb/benchmarks/benchmark.py
CHANGED
|
@@ -87,7 +87,10 @@ class RtebBenchmark(Benchmark):
|
|
|
87
87
|
def _create_summary_table(
|
|
88
88
|
self, benchmark_results: BenchmarkResults
|
|
89
89
|
) -> pd.DataFrame:
|
|
90
|
-
|
|
90
|
+
joint_table = _create_summary_table_mean_public_private(benchmark_results)
|
|
91
|
+
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
|
|
92
|
+
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
|
|
93
|
+
return joint_table
|
|
91
94
|
|
|
92
95
|
|
|
93
96
|
class HUMEBenchmark(Benchmark):
|
|
@@ -106,3 +109,17 @@ class MIEBBenchmark(Benchmark):
|
|
|
106
109
|
self, benchmark_results: BenchmarkResults
|
|
107
110
|
) -> pd.DataFrame:
|
|
108
111
|
return _create_summary_table_mean_task_type(benchmark_results)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class VidoreBenchmark(Benchmark):
|
|
115
|
+
"""Wrapper for Vidore3 benchmark."""
|
|
116
|
+
|
|
117
|
+
def _create_summary_table(
|
|
118
|
+
self, benchmark_results: BenchmarkResults
|
|
119
|
+
) -> pd.DataFrame:
|
|
120
|
+
joint_table = _create_summary_table_mean_public_private(benchmark_results)
|
|
121
|
+
# For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
|
|
122
|
+
joint_table = joint_table.rename(
|
|
123
|
+
columns={"Document Understanding": "Mean (Task)"}
|
|
124
|
+
)
|
|
125
|
+
return joint_table
|
|
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
27
27
|
MTEB_KOR,
|
|
28
28
|
MTEB_MAIN_RU,
|
|
29
29
|
MTEB_MINERS_BITEXT_MINING,
|
|
30
|
+
MTEB_NL,
|
|
30
31
|
MTEB_POL,
|
|
31
32
|
MTEB_RETRIEVAL_LAW,
|
|
32
33
|
MTEB_RETRIEVAL_MEDICAL,
|
|
@@ -37,6 +38,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
37
38
|
SEB,
|
|
38
39
|
VIDORE,
|
|
39
40
|
VIDORE_V2,
|
|
41
|
+
VIDORE_V3,
|
|
40
42
|
VISUAL_DOCUMENT_RETRIEVAL,
|
|
41
43
|
VN_MTEB,
|
|
42
44
|
CoIR,
|
|
@@ -87,6 +89,7 @@ __all__ = [
|
|
|
87
89
|
"MTEB_KOR",
|
|
88
90
|
"MTEB_MAIN_RU",
|
|
89
91
|
"MTEB_MINERS_BITEXT_MINING",
|
|
92
|
+
"MTEB_NL",
|
|
90
93
|
"MTEB_POL",
|
|
91
94
|
"MTEB_RETRIEVAL_LAW",
|
|
92
95
|
"MTEB_RETRIEVAL_MEDICAL",
|
|
@@ -106,6 +109,7 @@ __all__ = [
|
|
|
106
109
|
"SEB",
|
|
107
110
|
"VIDORE",
|
|
108
111
|
"VIDORE_V2",
|
|
112
|
+
"VIDORE_V3",
|
|
109
113
|
"VISUAL_DOCUMENT_RETRIEVAL",
|
|
110
114
|
"VN_MTEB",
|
|
111
115
|
"CoIR",
|