mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
4
|
from collections import Counter
|
|
3
|
-
|
|
4
|
-
from
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from typing import TYPE_CHECKING, cast
|
|
5
7
|
|
|
6
8
|
from mteb.types import TopRankedDocumentsType
|
|
7
9
|
from mteb.types.statistics import (
|
|
@@ -13,6 +15,9 @@ from mteb.types.statistics import (
|
|
|
13
15
|
TopRankedStatistics,
|
|
14
16
|
)
|
|
15
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from PIL import Image
|
|
20
|
+
|
|
16
21
|
|
|
17
22
|
def calculate_text_statistics(texts: list[str]) -> TextStatistics:
|
|
18
23
|
"""Calculate descriptive statistics for a list of texts.
|
|
@@ -48,7 +53,7 @@ def calculate_image_statistics(images: list[Image.Image]) -> ImageStatistics:
|
|
|
48
53
|
seen_hashes: set[str] = set()
|
|
49
54
|
|
|
50
55
|
for img in images:
|
|
51
|
-
width, height = img.size
|
|
56
|
+
width, height = img.size
|
|
52
57
|
img_heights.append(height)
|
|
53
58
|
img_widths.append(width)
|
|
54
59
|
|
|
@@ -78,17 +83,24 @@ def calculate_label_statistics(labels: list[int | list[int]]) -> LabelStatistics
|
|
|
78
83
|
LabelStatistics: A dictionary containing the descriptive statistics.
|
|
79
84
|
|
|
80
85
|
"""
|
|
86
|
+
total_labels: list[int | None] = []
|
|
87
|
+
|
|
81
88
|
if not isinstance(labels[0], list):
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
89
|
+
# single label classification
|
|
90
|
+
single_label = cast(list[int], labels)
|
|
91
|
+
label_len = [1] * len(single_label)
|
|
92
|
+
total_label_len = len(single_label)
|
|
93
|
+
total_labels.extend(single_label)
|
|
85
94
|
elif isinstance(labels[0], list):
|
|
86
95
|
# multilabel classification
|
|
87
|
-
|
|
96
|
+
multilabel_labels = cast(list[list[int]], labels)
|
|
97
|
+
label_len = [len(l) for l in multilabel_labels]
|
|
88
98
|
total_label_len = sum(label_len)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
99
|
+
for l in multilabel_labels:
|
|
100
|
+
if l and len(l) > 0:
|
|
101
|
+
total_labels.extend(l)
|
|
102
|
+
else:
|
|
103
|
+
total_labels.append(None)
|
|
92
104
|
else:
|
|
93
105
|
raise ValueError(
|
|
94
106
|
"Labels must be a list of integers or a list of lists of integers."
|
|
@@ -155,7 +167,7 @@ def calculate_top_ranked_statistics(
|
|
|
155
167
|
|
|
156
168
|
|
|
157
169
|
def calculate_relevant_docs_statistics(
|
|
158
|
-
relevant_docs:
|
|
170
|
+
relevant_docs: Mapping[str, Mapping[str, int]],
|
|
159
171
|
) -> RelevantDocsStatistics:
|
|
160
172
|
qrels_lengths = [len(relevant_docs[qid]) for qid in relevant_docs]
|
|
161
173
|
unique_qrels = len({doc for qid in relevant_docs for doc in relevant_docs[qid]})
|
mteb/abstasks/_stratification.py
CHANGED
|
@@ -39,6 +39,7 @@ Bibtex:
|
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
41
|
import itertools
|
|
42
|
+
from typing import Any
|
|
42
43
|
|
|
43
44
|
import numpy as np
|
|
44
45
|
import scipy.sparse as sp
|
|
@@ -119,8 +120,10 @@ def _get_most_desired_combination(samples_with_combination: dict):
|
|
|
119
120
|
if support_size == 0:
|
|
120
121
|
continue
|
|
121
122
|
if currently_chosen is None or (
|
|
122
|
-
best_number_of_combinations
|
|
123
|
-
and best_support_size
|
|
123
|
+
best_number_of_combinations is not None
|
|
124
|
+
and best_support_size is not None
|
|
125
|
+
and best_number_of_combinations < number_of_combinations
|
|
126
|
+
and best_support_size > support_size
|
|
124
127
|
):
|
|
125
128
|
currently_chosen = combination
|
|
126
129
|
best_number_of_combinations, best_support_size = (
|
|
@@ -162,7 +165,7 @@ class IterativeStratification(_BaseKFold):
|
|
|
162
165
|
self._rng_state = check_random_state(random_state)
|
|
163
166
|
need_shuffle = shuffle or random_state is not None
|
|
164
167
|
self.order = order
|
|
165
|
-
super().__init__(
|
|
168
|
+
super().__init__(
|
|
166
169
|
n_splits,
|
|
167
170
|
shuffle=need_shuffle,
|
|
168
171
|
random_state=self._rng_state if need_shuffle else None,
|
|
@@ -172,8 +175,7 @@ class IterativeStratification(_BaseKFold):
|
|
|
172
175
|
self.percentage_per_fold = sample_distribution_per_fold
|
|
173
176
|
else:
|
|
174
177
|
self.percentage_per_fold = [
|
|
175
|
-
1 / float(self.n_splits)
|
|
176
|
-
for _ in range(self.n_splits) # type: ignore
|
|
178
|
+
1 / float(self.n_splits) for _ in range(self.n_splits)
|
|
177
179
|
]
|
|
178
180
|
|
|
179
181
|
def _prepare_stratification(
|
|
@@ -182,9 +184,9 @@ class IterativeStratification(_BaseKFold):
|
|
|
182
184
|
list[list[int]],
|
|
183
185
|
dict[int, bool],
|
|
184
186
|
list[list[int]],
|
|
185
|
-
list[list[
|
|
186
|
-
dict[
|
|
187
|
-
list[list[
|
|
187
|
+
list[list[Any]],
|
|
188
|
+
dict[str, list[Any]],
|
|
189
|
+
list[list[Any]],
|
|
188
190
|
]:
|
|
189
191
|
"""Prepares variables for performing stratification
|
|
190
192
|
|
|
@@ -206,14 +208,14 @@ class IterativeStratification(_BaseKFold):
|
|
|
206
208
|
"""
|
|
207
209
|
self.n_samples, self.n_labels = y.shape
|
|
208
210
|
self.desired_samples_per_fold = np.array(
|
|
209
|
-
[self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
|
|
211
|
+
[self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
|
|
210
212
|
)
|
|
211
213
|
rows = sp.lil_matrix(y).rows
|
|
212
214
|
rows_used = dict.fromkeys(range(self.n_samples), False)
|
|
213
215
|
all_combinations = []
|
|
214
|
-
per_row_combinations = [[] for i in range(self.n_samples)]
|
|
215
|
-
samples_with_combination = {}
|
|
216
|
-
folds = [[] for _ in range(self.n_splits)]
|
|
216
|
+
per_row_combinations: list[list[Any]] = [[] for i in range(self.n_samples)]
|
|
217
|
+
samples_with_combination: dict[str, list[Any]] = {}
|
|
218
|
+
folds: list[list[int]] = [[] for _ in range(self.n_splits)]
|
|
217
219
|
|
|
218
220
|
# for every row
|
|
219
221
|
for sample_index, label_assignment in enumerate(rows):
|
|
@@ -229,21 +231,19 @@ class IterativeStratification(_BaseKFold):
|
|
|
229
231
|
all_combinations.append(combination)
|
|
230
232
|
per_row_combinations[sample_index].append(combination)
|
|
231
233
|
|
|
232
|
-
all_combinations = [list(x) for x in set(all_combinations)]
|
|
233
|
-
|
|
234
234
|
self.desired_samples_per_combination_per_fold = {
|
|
235
235
|
combination: np.array(
|
|
236
236
|
[
|
|
237
237
|
len(evidence_for_combination) * self.percentage_per_fold[j]
|
|
238
|
-
for j in range(self.n_splits)
|
|
238
|
+
for j in range(self.n_splits)
|
|
239
239
|
]
|
|
240
240
|
)
|
|
241
241
|
for combination, evidence_for_combination in samples_with_combination.items()
|
|
242
242
|
}
|
|
243
243
|
return (
|
|
244
|
-
rows,
|
|
244
|
+
rows.tolist(),
|
|
245
245
|
rows_used,
|
|
246
|
-
all_combinations,
|
|
246
|
+
[list(x) for x in set(all_combinations)],
|
|
247
247
|
per_row_combinations,
|
|
248
248
|
samples_with_combination,
|
|
249
249
|
folds,
|
|
@@ -328,7 +328,7 @@ class IterativeStratification(_BaseKFold):
|
|
|
328
328
|
per_row_combinations,
|
|
329
329
|
samples_with_combination,
|
|
330
330
|
folds,
|
|
331
|
-
) = self._prepare_stratification(y)
|
|
331
|
+
) = self._prepare_stratification(y)
|
|
332
332
|
|
|
333
333
|
self._distribute_positive_evidence(
|
|
334
334
|
rows_used, folds, samples_with_combination, per_row_combinations
|
mteb/abstasks/abstask.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import warnings
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import Sequence
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
5
6
|
from copy import copy
|
|
6
7
|
from pathlib import Path
|
|
7
|
-
from typing import Any, cast
|
|
8
|
+
from typing import Any, Literal, cast
|
|
8
9
|
|
|
9
10
|
import numpy as np
|
|
10
11
|
from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
|
|
@@ -22,6 +23,7 @@ from mteb.models import (
|
|
|
22
23
|
SearchProtocol,
|
|
23
24
|
)
|
|
24
25
|
from mteb.types import HFSubset, Modalities, ScoresDict
|
|
26
|
+
from mteb.types._encoder_io import EncodeKwargs
|
|
25
27
|
from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
|
|
26
28
|
|
|
27
29
|
logger = logging.getLogger(__name__)
|
|
@@ -78,8 +80,8 @@ class AbsTask(ABC):
|
|
|
78
80
|
"""
|
|
79
81
|
|
|
80
82
|
metadata: TaskMetadata
|
|
81
|
-
abstask_prompt: str
|
|
82
|
-
_eval_splits:
|
|
83
|
+
abstask_prompt: str
|
|
84
|
+
_eval_splits: Sequence[str] | None = None
|
|
83
85
|
dataset: dict[HFSubset, DatasetDict] | None = None
|
|
84
86
|
data_loaded: bool = False
|
|
85
87
|
hf_subsets: list[HFSubset]
|
|
@@ -102,9 +104,9 @@ class AbsTask(ABC):
|
|
|
102
104
|
def check_if_dataset_is_superseded(self) -> None:
|
|
103
105
|
"""Check if the dataset is superseded by a newer version."""
|
|
104
106
|
if self.superseded_by:
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
)
|
|
107
|
+
msg = f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}'. We recommend using the newer version of the dataset unless you are running a specific benchmark. See `get_task('{self.superseded_by}').metadata.description` to get a description of the task and changes."
|
|
108
|
+
logger.warning(msg)
|
|
109
|
+
warnings.warn(msg)
|
|
108
110
|
|
|
109
111
|
def dataset_transform(self):
|
|
110
112
|
"""A transform operations applied to the dataset after loading.
|
|
@@ -120,10 +122,10 @@ class AbsTask(ABC):
|
|
|
120
122
|
split: str = "test",
|
|
121
123
|
subsets_to_run: list[HFSubset] | None = None,
|
|
122
124
|
*,
|
|
123
|
-
encode_kwargs:
|
|
125
|
+
encode_kwargs: EncodeKwargs,
|
|
124
126
|
prediction_folder: Path | None = None,
|
|
125
127
|
**kwargs: Any,
|
|
126
|
-
) ->
|
|
128
|
+
) -> Mapping[HFSubset, ScoresDict]:
|
|
127
129
|
"""Evaluates an MTEB compatible model on the task.
|
|
128
130
|
|
|
129
131
|
Args:
|
|
@@ -195,12 +197,12 @@ class AbsTask(ABC):
|
|
|
195
197
|
@abstractmethod
|
|
196
198
|
def _evaluate_subset(
|
|
197
199
|
self,
|
|
198
|
-
model:
|
|
200
|
+
model: MTEBModels,
|
|
199
201
|
data_split: Dataset,
|
|
200
202
|
*,
|
|
201
|
-
encode_kwargs: dict[str, Any],
|
|
202
203
|
hf_split: str,
|
|
203
204
|
hf_subset: str,
|
|
205
|
+
encode_kwargs: EncodeKwargs,
|
|
204
206
|
prediction_folder: Path | None = None,
|
|
205
207
|
**kwargs: Any,
|
|
206
208
|
) -> ScoresDict:
|
|
@@ -210,7 +212,7 @@ class AbsTask(ABC):
|
|
|
210
212
|
|
|
211
213
|
def _save_task_predictions(
|
|
212
214
|
self,
|
|
213
|
-
predictions:
|
|
215
|
+
predictions: Mapping[str, Any] | list[Any],
|
|
214
216
|
model: MTEBModels,
|
|
215
217
|
prediction_folder: Path,
|
|
216
218
|
hf_split: str,
|
|
@@ -226,7 +228,7 @@ class AbsTask(ABC):
|
|
|
226
228
|
hf_subset: The subset of the dataset (e.g. "en").
|
|
227
229
|
"""
|
|
228
230
|
predictions_path = self._predictions_path(prediction_folder)
|
|
229
|
-
existing_results = {
|
|
231
|
+
existing_results: dict[str, Any] = {
|
|
230
232
|
"mteb_model_meta": {
|
|
231
233
|
"model_name": model.mteb_model_meta.name,
|
|
232
234
|
"revision": model.mteb_model_meta.revision,
|
|
@@ -326,7 +328,7 @@ class AbsTask(ABC):
|
|
|
326
328
|
)
|
|
327
329
|
else:
|
|
328
330
|
# some of monolingual datasets explicitly adding the split name to the dataset name
|
|
329
|
-
self.dataset = load_dataset(**self.metadata.dataset)
|
|
331
|
+
self.dataset = load_dataset(**self.metadata.dataset)
|
|
330
332
|
self.dataset_transform()
|
|
331
333
|
self.data_loaded = True
|
|
332
334
|
|
|
@@ -362,15 +364,19 @@ class AbsTask(ABC):
|
|
|
362
364
|
"""
|
|
363
365
|
from mteb.abstasks import AbsTaskClassification
|
|
364
366
|
|
|
365
|
-
|
|
367
|
+
existing_stats = self.metadata.descriptive_stats
|
|
368
|
+
|
|
369
|
+
if existing_stats is not None and not overwrite_results:
|
|
366
370
|
logger.info("Loading metadata descriptive statistics from cache.")
|
|
367
|
-
return
|
|
371
|
+
return existing_stats
|
|
368
372
|
|
|
369
373
|
if not self.data_loaded:
|
|
370
374
|
self.load_data()
|
|
371
375
|
|
|
372
376
|
descriptive_stats: dict[str, DescriptiveStatistics] = {}
|
|
373
|
-
hf_subset_stat
|
|
377
|
+
hf_subset_stat: Literal["hf_subset_descriptive_stats"] = (
|
|
378
|
+
"hf_subset_descriptive_stats"
|
|
379
|
+
)
|
|
374
380
|
eval_splits = self.metadata.eval_splits
|
|
375
381
|
if isinstance(self, AbsTaskClassification):
|
|
376
382
|
eval_splits.append(self.train_split)
|
|
@@ -381,7 +387,7 @@ class AbsTask(ABC):
|
|
|
381
387
|
logger.info(f"Processing metadata for split {split}")
|
|
382
388
|
if self.metadata.is_multilingual:
|
|
383
389
|
descriptive_stats[split] = (
|
|
384
|
-
self._calculate_descriptive_statistics_from_split(
|
|
390
|
+
self._calculate_descriptive_statistics_from_split( # type: ignore[assignment]
|
|
385
391
|
split, compute_overall=True
|
|
386
392
|
)
|
|
387
393
|
)
|
|
@@ -400,7 +406,7 @@ class AbsTask(ABC):
|
|
|
400
406
|
descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
|
|
401
407
|
else:
|
|
402
408
|
split_details = self._calculate_descriptive_statistics_from_split(split)
|
|
403
|
-
descriptive_stats[split] = split_details
|
|
409
|
+
descriptive_stats[split] = split_details # type: ignore[assignment]
|
|
404
410
|
|
|
405
411
|
with self.metadata.descriptive_stat_path.open("w") as f:
|
|
406
412
|
json.dump(descriptive_stats, f, indent=4)
|
|
@@ -437,7 +443,7 @@ class AbsTask(ABC):
|
|
|
437
443
|
|
|
438
444
|
return self.metadata.languages
|
|
439
445
|
|
|
440
|
-
def filter_eval_splits(self, eval_splits:
|
|
446
|
+
def filter_eval_splits(self, eval_splits: Sequence[str] | None) -> Self:
|
|
441
447
|
"""Filter the evaluation splits of the task.
|
|
442
448
|
|
|
443
449
|
Args:
|
|
@@ -451,9 +457,9 @@ class AbsTask(ABC):
|
|
|
451
457
|
|
|
452
458
|
def filter_languages(
|
|
453
459
|
self,
|
|
454
|
-
languages:
|
|
455
|
-
script:
|
|
456
|
-
hf_subsets:
|
|
460
|
+
languages: Sequence[str] | None,
|
|
461
|
+
script: Sequence[str] | None = None,
|
|
462
|
+
hf_subsets: Sequence[HFSubset] | None = None,
|
|
457
463
|
exclusive_language_filter: bool = False,
|
|
458
464
|
) -> Self:
|
|
459
465
|
"""Filter the languages of the task.
|
|
@@ -499,12 +505,14 @@ class AbsTask(ABC):
|
|
|
499
505
|
self.hf_subsets = subsets_to_keep
|
|
500
506
|
return self
|
|
501
507
|
|
|
502
|
-
def _add_main_score(self, scores:
|
|
508
|
+
def _add_main_score(self, scores: ScoresDict) -> None:
|
|
503
509
|
scores["main_score"] = scores[self.metadata.main_score]
|
|
504
510
|
|
|
505
511
|
def _upload_dataset_to_hub(
|
|
506
512
|
self, repo_name: str, fields: list[str] | dict[str, str]
|
|
507
513
|
) -> None:
|
|
514
|
+
if self.dataset is None:
|
|
515
|
+
raise ValueError("Dataset not loaded")
|
|
508
516
|
if self.metadata.is_multilingual:
|
|
509
517
|
for config in self.metadata.eval_langs:
|
|
510
518
|
logger.info(f"Converting {config} of {self.metadata.name}")
|
|
@@ -574,7 +582,7 @@ class AbsTask(ABC):
|
|
|
574
582
|
return False
|
|
575
583
|
|
|
576
584
|
@property
|
|
577
|
-
def eval_splits(self) ->
|
|
585
|
+
def eval_splits(self) -> Sequence[str]:
|
|
578
586
|
"""Returns the evaluation splits of the task."""
|
|
579
587
|
if self._eval_splits:
|
|
580
588
|
return self._eval_splits
|
|
@@ -607,9 +615,8 @@ class AbsTask(ABC):
|
|
|
607
615
|
self.data_loaded = False
|
|
608
616
|
logger.info(f"Unloaded dataset {self.metadata.name} from memory.")
|
|
609
617
|
else:
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
)
|
|
618
|
+
msg = f"Dataset `{self.metadata.name}` is not loaded, cannot unload it."
|
|
619
|
+
logger.warning(msg)
|
|
613
620
|
|
|
614
621
|
@property
|
|
615
622
|
def superseded_by(self) -> str | None:
|
|
@@ -5,7 +5,6 @@ from pydantic import ConfigDict, Field, model_validator
|
|
|
5
5
|
from typing_extensions import Self
|
|
6
6
|
|
|
7
7
|
from mteb.types import (
|
|
8
|
-
HFSubset,
|
|
9
8
|
ISOLanguageScript,
|
|
10
9
|
Languages,
|
|
11
10
|
Licenses,
|
|
@@ -60,14 +59,7 @@ class AggregateTaskMetadata(TaskMetadata):
|
|
|
60
59
|
reference: str | None = None
|
|
61
60
|
bibtex_citation: str | None = None
|
|
62
61
|
|
|
63
|
-
@
|
|
64
|
-
def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISOLanguageScript]]:
|
|
65
|
-
"""Return a dictionary mapping huggingface subsets to languages."""
|
|
66
|
-
if isinstance(self.eval_langs, dict):
|
|
67
|
-
return self.eval_langs
|
|
68
|
-
return {"default": self.eval_langs} # type: ignore
|
|
69
|
-
|
|
70
|
-
@model_validator(mode="after") # type: ignore
|
|
62
|
+
@model_validator(mode="after")
|
|
71
63
|
def _compute_unfilled_cases(self) -> Self:
|
|
72
64
|
if not self.eval_langs:
|
|
73
65
|
self.eval_langs = self._compute_eval_langs()
|
mteb/abstasks/aggregated_task.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import warnings
|
|
3
|
+
from collections.abc import Mapping
|
|
2
4
|
from pathlib import Path
|
|
3
5
|
from typing import Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
from datasets import Dataset, DatasetDict
|
|
7
|
-
from typing_extensions import Self
|
|
8
9
|
|
|
9
10
|
from mteb.models.models_protocols import MTEBModels
|
|
10
11
|
from mteb.results.task_result import TaskResult
|
|
11
|
-
from mteb.types import HFSubset, ScoresDict
|
|
12
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
12
13
|
from mteb.types.statistics import DescriptiveStatistics
|
|
13
14
|
|
|
14
15
|
from .abstask import AbsTask
|
|
@@ -32,7 +33,7 @@ class AbsTaskAggregate(AbsTask):
|
|
|
32
33
|
|
|
33
34
|
def task_results_to_scores(
|
|
34
35
|
self, task_results: list[TaskResult]
|
|
35
|
-
) -> dict[str,
|
|
36
|
+
) -> dict[str, Mapping[HFSubset, ScoresDict]]:
|
|
36
37
|
"""The function that aggregated scores. Can be redefined to allow for custom aggregations.
|
|
37
38
|
|
|
38
39
|
Args:
|
|
@@ -41,7 +42,7 @@ class AbsTaskAggregate(AbsTask):
|
|
|
41
42
|
Returns:
|
|
42
43
|
A dictionary with the aggregated scores.
|
|
43
44
|
"""
|
|
44
|
-
scores = {}
|
|
45
|
+
scores: dict[str, Mapping[HFSubset, ScoresDict]] = {}
|
|
45
46
|
subsets = (
|
|
46
47
|
self.metadata.eval_langs.keys()
|
|
47
48
|
if isinstance(self.metadata.eval_langs, dict)
|
|
@@ -113,40 +114,20 @@ class AbsTaskAggregate(AbsTask):
|
|
|
113
114
|
)
|
|
114
115
|
mteb_versions = {tr.mteb_version for tr in task_results}
|
|
115
116
|
if len(mteb_versions) != 1:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
)
|
|
117
|
+
msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
|
|
118
|
+
logger.warning(msg)
|
|
119
|
+
warnings.warn(msg)
|
|
119
120
|
task_res.mteb_version = None
|
|
120
121
|
task_res.mteb_version = task_results[0].mteb_version
|
|
121
122
|
return task_res
|
|
122
123
|
|
|
123
|
-
def check_if_dataset_is_superseded(self) -> None:
|
|
124
|
-
"""Check if the dataset is superseded by a newer version"""
|
|
125
|
-
if self.superseded_by:
|
|
126
|
-
logger.warning(
|
|
127
|
-
f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset."
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
def filter_eval_splits(self, eval_splits: list[str] | None) -> Self:
|
|
131
|
-
"""Filter the evaluation splits of the task.
|
|
132
|
-
|
|
133
|
-
Args:
|
|
134
|
-
eval_splits: List of splits to evaluate on. If None, all splits in metadata
|
|
135
|
-
are used.
|
|
136
|
-
|
|
137
|
-
Returns:
|
|
138
|
-
The task with filtered evaluation splits.
|
|
139
|
-
"""
|
|
140
|
-
self._eval_splits = eval_splits
|
|
141
|
-
return self
|
|
142
|
-
|
|
143
124
|
def evaluate(
|
|
144
125
|
self,
|
|
145
126
|
model: MTEBModels,
|
|
146
127
|
split: str = "test",
|
|
147
128
|
subsets_to_run: list[HFSubset] | None = None,
|
|
148
129
|
*,
|
|
149
|
-
encode_kwargs:
|
|
130
|
+
encode_kwargs: EncodeKwargs,
|
|
150
131
|
prediction_folder: Path | None = None,
|
|
151
132
|
**kwargs: Any,
|
|
152
133
|
) -> dict[HFSubset, ScoresDict]:
|
|
@@ -160,7 +141,7 @@ class AbsTaskAggregate(AbsTask):
|
|
|
160
141
|
self,
|
|
161
142
|
model: MTEBModels,
|
|
162
143
|
data_split: DatasetDict | Dataset,
|
|
163
|
-
encode_kwargs:
|
|
144
|
+
encode_kwargs: EncodeKwargs,
|
|
164
145
|
**kwargs: Any,
|
|
165
146
|
) -> ScoresDict:
|
|
166
147
|
raise NotImplementedError(
|
mteb/abstasks/classification.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Any, TypedDict
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
from datasets import Dataset, DatasetDict
|
|
8
|
-
from PIL import ImageFile
|
|
9
8
|
from sklearn.linear_model import LogisticRegression
|
|
10
9
|
from sklearn.metrics import (
|
|
11
10
|
accuracy_score,
|
|
@@ -17,7 +16,7 @@ from sklearn.metrics import (
|
|
|
17
16
|
|
|
18
17
|
from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
|
|
19
18
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
20
|
-
from mteb.types import HFSubset, ScoresDict
|
|
19
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
21
20
|
from mteb.types.statistics import (
|
|
22
21
|
ImageStatistics,
|
|
23
22
|
LabelStatistics,
|
|
@@ -32,7 +31,6 @@ from ._statistics_calculation import (
|
|
|
32
31
|
)
|
|
33
32
|
from .abstask import AbsTask
|
|
34
33
|
|
|
35
|
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
36
34
|
logger = logging.getLogger(__name__)
|
|
37
35
|
|
|
38
36
|
|
|
@@ -100,9 +98,8 @@ class AbsTaskClassification(AbsTask):
|
|
|
100
98
|
text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
|
|
101
99
|
label: int. Column name can be changed via `label_column_name` attribute.
|
|
102
100
|
evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
n_experiments: Number of experiments to run. Default is 10.
|
|
101
|
+
samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
|
|
102
|
+
n_experiments: Number of experiments to run. Default is 10.
|
|
106
103
|
train_split: Name of the split to use for training the evaluator model. Default is "train".
|
|
107
104
|
label_column_name: Name of the column containing the labels. Default is "label".
|
|
108
105
|
input_column_name: Name of the column containing the input data. Default is "text".
|
|
@@ -128,7 +125,7 @@ class AbsTaskClassification(AbsTask):
|
|
|
128
125
|
split: str = "test",
|
|
129
126
|
subsets_to_run: list[HFSubset] | None = None,
|
|
130
127
|
*,
|
|
131
|
-
encode_kwargs:
|
|
128
|
+
encode_kwargs: EncodeKwargs,
|
|
132
129
|
prediction_folder: Path | None = None,
|
|
133
130
|
**kwargs: Any,
|
|
134
131
|
) -> dict[HFSubset, ScoresDict]:
|
|
@@ -145,6 +142,9 @@ class AbsTaskClassification(AbsTask):
|
|
|
145
142
|
if not self.data_loaded:
|
|
146
143
|
self.load_data()
|
|
147
144
|
|
|
145
|
+
if self.dataset is None:
|
|
146
|
+
raise RuntimeError("Dataset not loaded.")
|
|
147
|
+
|
|
148
148
|
if "random_state" in self.evaluator_model.get_params():
|
|
149
149
|
self.evaluator_model = self.evaluator_model.set_params(
|
|
150
150
|
random_state=self.seed
|
|
@@ -177,19 +177,22 @@ class AbsTaskClassification(AbsTask):
|
|
|
177
177
|
)
|
|
178
178
|
self._add_main_score(scores[hf_subset])
|
|
179
179
|
|
|
180
|
-
return scores
|
|
180
|
+
return scores # type: ignore[return-value]
|
|
181
181
|
|
|
182
182
|
def _evaluate_subset(
|
|
183
183
|
self,
|
|
184
|
-
model:
|
|
184
|
+
model: MTEBModels,
|
|
185
185
|
data_split: DatasetDict,
|
|
186
186
|
*,
|
|
187
|
-
encode_kwargs:
|
|
187
|
+
encode_kwargs: EncodeKwargs,
|
|
188
188
|
hf_split: str,
|
|
189
189
|
hf_subset: str,
|
|
190
190
|
prediction_folder: Path | None = None,
|
|
191
191
|
**kwargs: Any,
|
|
192
192
|
) -> FullClassificationMetrics:
|
|
193
|
+
if not isinstance(model, EncoderProtocol):
|
|
194
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
195
|
+
|
|
193
196
|
train_split = data_split[self.train_split]
|
|
194
197
|
eval_split = data_split[hf_split]
|
|
195
198
|
|
|
@@ -239,7 +242,7 @@ class AbsTaskClassification(AbsTask):
|
|
|
239
242
|
# ap will be none for non binary classification tasks
|
|
240
243
|
k: (
|
|
241
244
|
float(np.mean(values))
|
|
242
|
-
if (values := [s[k] for s in scores if s[k] is not None])
|
|
245
|
+
if (values := [s[k] for s in scores if s[k] is not None]) # type: ignore[literal-required]
|
|
243
246
|
else np.nan
|
|
244
247
|
)
|
|
245
248
|
for k in scores[0].keys()
|
|
@@ -247,7 +250,7 @@ class AbsTaskClassification(AbsTask):
|
|
|
247
250
|
logger.info(f"Running {self.metadata.name} - Finished.")
|
|
248
251
|
return FullClassificationMetrics(
|
|
249
252
|
scores_per_experiment=scores,
|
|
250
|
-
**avg_scores,
|
|
253
|
+
**avg_scores, # type: ignore[typeddict-item]
|
|
251
254
|
)
|
|
252
255
|
|
|
253
256
|
def _calculate_scores(
|