mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/abstasks/clustering.py
CHANGED
|
@@ -3,7 +3,7 @@ import logging
|
|
|
3
3
|
import random
|
|
4
4
|
from collections import defaultdict
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, cast
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
from datasets import Dataset, DatasetDict
|
|
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
|
|
|
11
11
|
from sklearn.metrics.cluster import v_measure_score
|
|
12
12
|
|
|
13
13
|
from mteb._create_dataloaders import create_dataloader
|
|
14
|
-
from mteb.models import EncoderProtocol
|
|
15
|
-
from mteb.types import HFSubset, ScoresDict
|
|
14
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
15
|
+
from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
|
|
16
16
|
from mteb.types.statistics import (
|
|
17
17
|
ImageStatistics,
|
|
18
18
|
LabelStatistics,
|
|
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def _evaluate_clustering_bootstrapped(
|
|
37
|
-
embeddings:
|
|
37
|
+
embeddings: Array,
|
|
38
38
|
labels: list[list[str]],
|
|
39
39
|
n_clusters: int,
|
|
40
40
|
cluster_size: int,
|
|
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
|
|
|
61
61
|
max_depth = max(map(len, labels))
|
|
62
62
|
# Evaluate on each level til max depth
|
|
63
63
|
for i_level in range(max_depth):
|
|
64
|
-
level_labels = []
|
|
64
|
+
level_labels: list[str | int] = []
|
|
65
65
|
# Assign -1 to gold label if the level is not there
|
|
66
66
|
for label in labels:
|
|
67
67
|
if len(label) > i_level:
|
|
68
68
|
level_labels.append(label[i_level])
|
|
69
69
|
else:
|
|
70
70
|
level_labels.append(-1)
|
|
71
|
-
|
|
71
|
+
np_level_labels = np.array(level_labels)
|
|
72
72
|
valid_idx = np.array(
|
|
73
|
-
[level_label != -1 for level_label in
|
|
73
|
+
[level_label != -1 for level_label in np_level_labels]
|
|
74
74
|
) # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
|
|
75
|
-
|
|
75
|
+
np_level_labels = np_level_labels[valid_idx]
|
|
76
76
|
level_embeddings = embeddings[valid_idx]
|
|
77
77
|
clustering_model = MiniBatchKMeans(
|
|
78
|
-
n_clusters=np.unique(
|
|
78
|
+
n_clusters=np.unique(np_level_labels).size,
|
|
79
79
|
batch_size=kmean_batch_size,
|
|
80
80
|
init="k-means++",
|
|
81
81
|
n_init=1, # default when kmeans++ is used
|
|
@@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped(
|
|
|
87
87
|
cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)
|
|
88
88
|
|
|
89
89
|
_embeddings = level_embeddings[cluster_indices]
|
|
90
|
-
_labels =
|
|
90
|
+
_labels = np_level_labels[cluster_indices]
|
|
91
91
|
cluster_assignment = clustering_model.fit_predict(_embeddings)
|
|
92
92
|
v_measure = v_measure_score(_labels, cluster_assignment)
|
|
93
93
|
v_measures[f"Level {i_level}"].append(v_measure)
|
|
@@ -153,15 +153,19 @@ class AbsTaskClustering(AbsTask):
|
|
|
153
153
|
|
|
154
154
|
def _evaluate_subset(
|
|
155
155
|
self,
|
|
156
|
-
model:
|
|
156
|
+
model: MTEBModels,
|
|
157
157
|
data_split: Dataset,
|
|
158
158
|
*,
|
|
159
|
-
encode_kwargs:
|
|
159
|
+
encode_kwargs: EncodeKwargs,
|
|
160
160
|
hf_split: str,
|
|
161
161
|
hf_subset: str,
|
|
162
162
|
prediction_folder: Path | None = None,
|
|
163
163
|
**kwargs: Any,
|
|
164
164
|
) -> ScoresDict:
|
|
165
|
+
if not isinstance(model, EncoderProtocol):
|
|
166
|
+
raise TypeError(
|
|
167
|
+
"Expected encoder model to be an instance of EncoderProtocol."
|
|
168
|
+
)
|
|
165
169
|
if (
|
|
166
170
|
self.max_document_to_embed is not None
|
|
167
171
|
and self.max_fraction_of_documents_to_embed is not None
|
|
@@ -182,13 +186,13 @@ class AbsTaskClustering(AbsTask):
|
|
|
182
186
|
self.max_fraction_of_documents_to_embed * len(data_split)
|
|
183
187
|
)
|
|
184
188
|
else:
|
|
185
|
-
max_documents_to_embed = self.max_document_to_embed
|
|
189
|
+
max_documents_to_embed = cast(int, self.max_document_to_embed)
|
|
186
190
|
|
|
187
|
-
max_documents_to_embed = min(len(data_split), max_documents_to_embed)
|
|
191
|
+
max_documents_to_embed = min(len(data_split), max_documents_to_embed)
|
|
188
192
|
example_indices = self.rng_state.sample(
|
|
189
193
|
range(len(data_split)), k=max_documents_to_embed
|
|
190
194
|
)
|
|
191
|
-
downsampled_dataset = data_split.select(example_indices)
|
|
195
|
+
downsampled_dataset = data_split.select(example_indices)
|
|
192
196
|
|
|
193
197
|
downsampled_dataset = downsampled_dataset.select_columns(
|
|
194
198
|
[self.input_column_name, self.label_column_name]
|
|
@@ -200,7 +204,7 @@ class AbsTaskClustering(AbsTask):
|
|
|
200
204
|
downsampled_dataset,
|
|
201
205
|
self.metadata,
|
|
202
206
|
input_column=self.input_column_name,
|
|
203
|
-
|
|
207
|
+
**encode_kwargs,
|
|
204
208
|
),
|
|
205
209
|
task_metadata=self.metadata,
|
|
206
210
|
hf_subset=hf_subset,
|
|
@@ -8,8 +8,8 @@ from scipy.optimize import linear_sum_assignment
|
|
|
8
8
|
from sklearn import metrics
|
|
9
9
|
|
|
10
10
|
from mteb._evaluators import ClusteringEvaluator
|
|
11
|
-
from mteb.models import EncoderProtocol
|
|
12
|
-
from mteb.types import ScoresDict
|
|
11
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
12
|
+
from mteb.types import EncodeKwargs, ScoresDict
|
|
13
13
|
from mteb.types.statistics import (
|
|
14
14
|
ImageStatistics,
|
|
15
15
|
LabelStatistics,
|
|
@@ -80,15 +80,21 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
80
80
|
|
|
81
81
|
def _evaluate_subset(
|
|
82
82
|
self,
|
|
83
|
-
model:
|
|
83
|
+
model: MTEBModels,
|
|
84
84
|
data_split: Dataset,
|
|
85
85
|
*,
|
|
86
|
-
encode_kwargs:
|
|
86
|
+
encode_kwargs: EncodeKwargs,
|
|
87
87
|
hf_split: str,
|
|
88
88
|
hf_subset: str,
|
|
89
89
|
prediction_folder: Path | None = None,
|
|
90
90
|
**kwargs: Any,
|
|
91
91
|
) -> ScoresDict:
|
|
92
|
+
if not isinstance(model, EncoderProtocol):
|
|
93
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
94
|
+
|
|
95
|
+
data_split = data_split.select_columns(
|
|
96
|
+
[self.input_column_name, self.label_column_name]
|
|
97
|
+
)
|
|
92
98
|
# MTEB text clustering requires renaming and eval per subset.
|
|
93
99
|
if self.metadata.modalities == ["text"]:
|
|
94
100
|
all_metrics = []
|
|
@@ -136,9 +142,6 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
136
142
|
}
|
|
137
143
|
return scores
|
|
138
144
|
|
|
139
|
-
data_split = data_split.select_columns(
|
|
140
|
-
[self.input_column_name, self.label_column_name]
|
|
141
|
-
)
|
|
142
145
|
evaluator = self.evaluator(
|
|
143
146
|
data_split,
|
|
144
147
|
input_column_name=self.input_column_name,
|
|
@@ -148,10 +151,10 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
148
151
|
hf_subset=hf_subset,
|
|
149
152
|
**kwargs,
|
|
150
153
|
)
|
|
151
|
-
|
|
154
|
+
evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
|
|
152
155
|
if prediction_folder:
|
|
153
156
|
self._save_task_predictions(
|
|
154
|
-
|
|
157
|
+
evaluate_clusters,
|
|
155
158
|
model,
|
|
156
159
|
prediction_folder,
|
|
157
160
|
hf_subset=hf_subset,
|
|
@@ -160,7 +163,7 @@ class AbsTaskClusteringLegacy(AbsTask):
|
|
|
160
163
|
|
|
161
164
|
return self._compute_metrics(
|
|
162
165
|
data_split[self.label_column_name],
|
|
163
|
-
|
|
166
|
+
evaluate_clusters,
|
|
164
167
|
)
|
|
165
168
|
|
|
166
169
|
def _compute_metrics(
|
|
@@ -12,7 +12,8 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
12
12
|
calculate_text_statistics,
|
|
13
13
|
)
|
|
14
14
|
from mteb.abstasks.abstask import AbsTask
|
|
15
|
-
from mteb.models.models_protocols import EncoderProtocol
|
|
15
|
+
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
|
|
16
|
+
from mteb.types import EncodeKwargs
|
|
16
17
|
from mteb.types.statistics import (
|
|
17
18
|
ImageStatistics,
|
|
18
19
|
SplitDescriptiveStatistics,
|
|
@@ -116,15 +117,17 @@ class AbsTaskImageTextPairClassification(AbsTask):
|
|
|
116
117
|
|
|
117
118
|
def _evaluate_subset(
|
|
118
119
|
self,
|
|
119
|
-
model:
|
|
120
|
+
model: MTEBModels,
|
|
120
121
|
data_split: Dataset,
|
|
121
122
|
*,
|
|
122
|
-
encode_kwargs:
|
|
123
|
+
encode_kwargs: EncodeKwargs,
|
|
123
124
|
hf_split: str,
|
|
124
125
|
hf_subset: str,
|
|
125
126
|
prediction_folder: Path | None = None,
|
|
126
127
|
**kwargs: Any,
|
|
127
128
|
) -> ImageTextPairClassificationMetrics:
|
|
129
|
+
if not isinstance(model, EncoderProtocol):
|
|
130
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
128
131
|
select_columns = []
|
|
129
132
|
for columns in (self.images_column_names, self.texts_column_names):
|
|
130
133
|
if isinstance(columns, str):
|
|
@@ -154,7 +157,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
|
|
|
154
157
|
hf_subset=hf_subset,
|
|
155
158
|
**kwargs,
|
|
156
159
|
)
|
|
157
|
-
scores = evaluator(model, encode_kwargs=encode_kwargs)
|
|
160
|
+
scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment]
|
|
158
161
|
if prediction_folder:
|
|
159
162
|
self._save_task_predictions(
|
|
160
163
|
[score.tolist() for score in scores],
|
|
@@ -14,8 +14,10 @@ from sklearn.preprocessing import MultiLabelBinarizer
|
|
|
14
14
|
from typing_extensions import override
|
|
15
15
|
|
|
16
16
|
from mteb._create_dataloaders import create_dataloader
|
|
17
|
+
from mteb._evaluators.classification_metrics import hamming_score
|
|
17
18
|
from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
|
|
18
|
-
from mteb.models import EncoderProtocol
|
|
19
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
20
|
+
from mteb.types import Array, EncodeKwargs
|
|
19
21
|
|
|
20
22
|
from .classification import AbsTaskClassification
|
|
21
23
|
|
|
@@ -23,14 +25,14 @@ logger = logging.getLogger(__name__)
|
|
|
23
25
|
|
|
24
26
|
|
|
25
27
|
def _evaluate_classifier(
|
|
26
|
-
embeddings_train:
|
|
28
|
+
embeddings_train: Array,
|
|
27
29
|
y_train: np.ndarray,
|
|
28
|
-
embeddings_test:
|
|
30
|
+
embeddings_test: Array,
|
|
29
31
|
classifier: SklearnModelProtocol,
|
|
30
32
|
) -> tuple[np.ndarray, SklearnModelProtocol]:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
return
|
|
33
|
+
classifier_copy: SklearnModelProtocol = clone(classifier)
|
|
34
|
+
classifier_copy.fit(embeddings_train, y_train)
|
|
35
|
+
return classifier_copy.predict(embeddings_test), classifier_copy
|
|
34
36
|
|
|
35
37
|
|
|
36
38
|
class MultilabelClassificationMetrics(TypedDict):
|
|
@@ -40,11 +42,13 @@ class MultilabelClassificationMetrics(TypedDict):
|
|
|
40
42
|
accuracy: Accuracy of the classifier.
|
|
41
43
|
lrap: Label Ranking Average Precision (LRAP) score.
|
|
42
44
|
f1: Macro F1 score.
|
|
45
|
+
hamming: Hamming score (label-based accuracy).
|
|
43
46
|
"""
|
|
44
47
|
|
|
45
48
|
accuracy: float
|
|
46
49
|
lrap: float
|
|
47
50
|
f1: float
|
|
51
|
+
hamming: float
|
|
48
52
|
|
|
49
53
|
|
|
50
54
|
class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
|
|
@@ -66,25 +70,28 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
66
70
|
input_column_name: Name of the column containing the input text.
|
|
67
71
|
label_column_name: Name of the column containing the labels.
|
|
68
72
|
samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
|
|
69
|
-
|
|
73
|
+
evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
|
|
70
74
|
"""
|
|
71
75
|
|
|
72
|
-
|
|
76
|
+
evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
|
|
73
77
|
input_column_name: str = "text"
|
|
74
78
|
label_column_name: str = "label"
|
|
75
79
|
|
|
76
80
|
@override
|
|
77
|
-
def _evaluate_subset(
|
|
81
|
+
def _evaluate_subset( # type: ignore[override]
|
|
78
82
|
self,
|
|
79
|
-
model:
|
|
83
|
+
model: MTEBModels,
|
|
80
84
|
data_split: DatasetDict,
|
|
81
85
|
*,
|
|
82
|
-
encode_kwargs:
|
|
86
|
+
encode_kwargs: EncodeKwargs,
|
|
83
87
|
hf_split: str,
|
|
84
88
|
hf_subset: str,
|
|
85
89
|
prediction_folder: Path | None = None,
|
|
86
90
|
**kwargs: Any,
|
|
87
91
|
) -> FullMultilabelClassificationMetrics:
|
|
92
|
+
if not isinstance(model, EncoderProtocol):
|
|
93
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
94
|
+
|
|
88
95
|
if isinstance(data_split, DatasetDict):
|
|
89
96
|
data_split = data_split.select_columns(
|
|
90
97
|
[self.input_column_name, self.label_column_name]
|
|
@@ -112,7 +119,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
112
119
|
unique_train_dataset,
|
|
113
120
|
self.metadata,
|
|
114
121
|
input_column=self.input_column_name,
|
|
115
|
-
|
|
122
|
+
**encode_kwargs,
|
|
116
123
|
)
|
|
117
124
|
|
|
118
125
|
logger.info("Running multilabel classification - Encoding training set...")
|
|
@@ -141,7 +148,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
141
148
|
test_dataset.select_columns(self.input_column_name),
|
|
142
149
|
self.metadata,
|
|
143
150
|
input_column=self.input_column_name,
|
|
144
|
-
|
|
151
|
+
**encode_kwargs,
|
|
145
152
|
)
|
|
146
153
|
|
|
147
154
|
logger.info("Running multilabel classification - Encoding test set...")
|
|
@@ -157,12 +164,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
157
164
|
|
|
158
165
|
logger.info("Running multilabel classification - Evaluating classifiers...")
|
|
159
166
|
all_predictions = []
|
|
160
|
-
for
|
|
167
|
+
for _, sample_indices in enumerate(train_samples):
|
|
161
168
|
X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
|
|
162
169
|
y_train = train_split.select(sample_indices)[self.label_column_name]
|
|
163
170
|
y_train = binarizer.transform(y_train)
|
|
164
171
|
y_pred, current_classifier = _evaluate_classifier(
|
|
165
|
-
X_train, y_train, X_test, self.
|
|
172
|
+
X_train, y_train, X_test, self.evaluator_model
|
|
166
173
|
)
|
|
167
174
|
if prediction_folder:
|
|
168
175
|
all_predictions.append(y_pred.tolist())
|
|
@@ -182,19 +189,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
182
189
|
)
|
|
183
190
|
|
|
184
191
|
avg_scores: dict[str, Any] = {
|
|
185
|
-
k: np.mean([s[k] for s in scores])
|
|
192
|
+
k: np.mean([s[k] for s in scores]) # type: ignore[literal-required]
|
|
193
|
+
for k in scores[0].keys()
|
|
186
194
|
}
|
|
187
195
|
logger.info("Running multilabel classification - Finished.")
|
|
188
196
|
return FullMultilabelClassificationMetrics(
|
|
189
197
|
scores_per_experiment=scores,
|
|
190
|
-
**avg_scores,
|
|
198
|
+
**avg_scores, # type: ignore[typeddict-item]
|
|
191
199
|
)
|
|
192
200
|
|
|
193
|
-
def _calculate_scores(
|
|
201
|
+
def _calculate_scores( # type: ignore[override]
|
|
194
202
|
self,
|
|
195
203
|
y_test: np.ndarray,
|
|
196
204
|
y_pred: np.ndarray,
|
|
197
|
-
x_test_embedding:
|
|
205
|
+
x_test_embedding: Array,
|
|
198
206
|
current_classifier: SklearnModelProtocol,
|
|
199
207
|
) -> MultilabelClassificationMetrics:
|
|
200
208
|
accuracy = current_classifier.score(x_test_embedding, y_test)
|
|
@@ -207,10 +215,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
207
215
|
else:
|
|
208
216
|
lrap = label_ranking_average_precision_score(y_test, y_pred)
|
|
209
217
|
f1 = f1_score(y_test, y_pred, average="macro")
|
|
218
|
+
hamming = hamming_score(y_test, y_pred)
|
|
210
219
|
return MultilabelClassificationMetrics(
|
|
211
220
|
accuracy=accuracy,
|
|
212
221
|
lrap=lrap,
|
|
213
222
|
f1=f1,
|
|
223
|
+
hamming=hamming,
|
|
214
224
|
)
|
|
215
225
|
|
|
216
226
|
def _undersample_data_indices(
|
|
@@ -218,6 +228,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
218
228
|
) -> tuple[list[int], list[int]]:
|
|
219
229
|
"""Undersample data to have samples_per_label samples of each label.
|
|
220
230
|
|
|
231
|
+
Currently ensures that each label has at least samples_per_label samples.
|
|
232
|
+
|
|
221
233
|
Returns:
|
|
222
234
|
A tuple containing:
|
|
223
235
|
- List of sampled indices.
|
|
@@ -225,10 +237,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
225
237
|
"""
|
|
226
238
|
sample_indices = []
|
|
227
239
|
if idxs is None:
|
|
228
|
-
idxs = np.arange(len(y))
|
|
240
|
+
idxs = list(np.arange(len(y)))
|
|
229
241
|
self.np_rng.shuffle(idxs)
|
|
230
|
-
|
|
231
|
-
label_counter = defaultdict(int)
|
|
242
|
+
label_counter: dict[int, int] = defaultdict(int)
|
|
232
243
|
for i in idxs:
|
|
233
244
|
if any((label_counter[label] < samples_per_label) for label in y[i]):
|
|
234
245
|
sample_indices.append(i)
|
|
@@ -18,7 +18,8 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
18
18
|
)
|
|
19
19
|
from mteb.abstasks.abstask import AbsTask
|
|
20
20
|
from mteb.models.model_meta import ScoringFunction
|
|
21
|
-
from mteb.models.models_protocols import EncoderProtocol
|
|
21
|
+
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
|
|
22
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
22
23
|
from mteb.types.statistics import (
|
|
23
24
|
ImageStatistics,
|
|
24
25
|
LabelStatistics,
|
|
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
|
|
|
35
36
|
Attributes:
|
|
36
37
|
num_samples: number of samples in the dataset.
|
|
37
38
|
number_of_characters: Total number of symbols in the dataset.
|
|
38
|
-
|
|
39
|
+
unique_pairs: Number of unique pairs
|
|
39
40
|
|
|
40
41
|
text1_statistics: Statistics for sentence1
|
|
41
42
|
text2_statistics: Statistics for sentence2
|
|
@@ -43,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
|
|
|
43
44
|
"""
|
|
44
45
|
|
|
45
46
|
num_samples: int
|
|
46
|
-
number_of_characters: int
|
|
47
|
-
unique_pairs: int
|
|
47
|
+
number_of_characters: int | None
|
|
48
|
+
unique_pairs: int | None
|
|
48
49
|
|
|
49
50
|
text1_statistics: TextStatistics | None
|
|
50
51
|
image1_statistics: ImageStatistics | None
|
|
@@ -65,24 +66,31 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
65
66
|
input2_column_name: The name of the column containing the second sentence in the pair.
|
|
66
67
|
label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
|
|
67
68
|
abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
|
|
69
|
+
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
|
|
70
|
+
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
|
|
68
71
|
"""
|
|
69
72
|
|
|
70
73
|
abstask_prompt = "Retrieve text that are semantically similar to the given text."
|
|
71
74
|
input1_column_name: str = "sentence1"
|
|
72
75
|
input2_column_name: str = "sentence2"
|
|
73
76
|
label_column_name: str = "labels"
|
|
77
|
+
input1_prompt_type: PromptType | None = None
|
|
78
|
+
input2_prompt_type: PromptType | None = None
|
|
74
79
|
|
|
75
80
|
def _evaluate_subset(
|
|
76
81
|
self,
|
|
77
|
-
model:
|
|
82
|
+
model: MTEBModels,
|
|
78
83
|
data_split: Dataset,
|
|
79
84
|
*,
|
|
80
85
|
hf_split: str,
|
|
81
86
|
hf_subset: str,
|
|
82
|
-
encode_kwargs:
|
|
87
|
+
encode_kwargs: EncodeKwargs,
|
|
83
88
|
prediction_folder: Path | None = None,
|
|
84
89
|
**kwargs,
|
|
85
90
|
) -> dict[str, float]:
|
|
91
|
+
if not isinstance(model, EncoderProtocol):
|
|
92
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
93
|
+
|
|
86
94
|
if self.metadata.modalities == ["text"]:
|
|
87
95
|
# for compatibility with v1 version where datasets were stored in a single row
|
|
88
96
|
data_split = data_split[0] if len(data_split) == 1 else data_split
|
|
@@ -93,6 +101,8 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
93
101
|
task_metadata=self.metadata,
|
|
94
102
|
hf_split=hf_split,
|
|
95
103
|
hf_subset=hf_subset,
|
|
104
|
+
input1_prompt_type=self.input1_prompt_type,
|
|
105
|
+
input2_prompt_type=self.input2_prompt_type,
|
|
96
106
|
**kwargs,
|
|
97
107
|
)
|
|
98
108
|
similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
|
|
@@ -113,7 +123,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
113
123
|
self, similarity_scores: PairClassificationDistances, labels: list[int]
|
|
114
124
|
) -> dict[str, float]:
|
|
115
125
|
logger.info("Computing metrics...")
|
|
116
|
-
|
|
126
|
+
np_labels = np.asarray(labels)
|
|
117
127
|
output_scores = {}
|
|
118
128
|
max_scores = defaultdict(list)
|
|
119
129
|
for short_name, scores, reverse in [
|
|
@@ -135,7 +145,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
135
145
|
],
|
|
136
146
|
[ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
|
|
137
147
|
]:
|
|
138
|
-
metrics = self._compute_metrics_values(scores,
|
|
148
|
+
metrics = self._compute_metrics_values(scores, np_labels, reverse) # type: ignore[arg-type]
|
|
139
149
|
for metric_name, metric_value in metrics.items():
|
|
140
150
|
output_scores[f"{short_name}_{metric_name}"] = metric_value
|
|
141
151
|
max_scores[metric_name].append(metric_value)
|
|
@@ -230,6 +240,12 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
230
240
|
|
|
231
241
|
def _push_dataset_to_hub(self, repo_name: str) -> None:
|
|
232
242
|
# previously pair classification datasets were stored in a single row
|
|
243
|
+
if self.dataset is None:
|
|
244
|
+
# overall this shouldn't happen as we check for dataset before pushing to hub
|
|
245
|
+
# added here for type checking purposes
|
|
246
|
+
raise RuntimeError(
|
|
247
|
+
"Dataset not loaded. To load dataset run `task.load_data()`."
|
|
248
|
+
)
|
|
233
249
|
if self.metadata.is_multilingual:
|
|
234
250
|
for subset in self.dataset:
|
|
235
251
|
for split in self.dataset[subset]:
|
|
@@ -283,13 +299,13 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
283
299
|
)
|
|
284
300
|
|
|
285
301
|
def _find_best_acc_and_threshold(
|
|
286
|
-
self, scores:
|
|
302
|
+
self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
|
|
287
303
|
) -> tuple[float, float]:
|
|
288
304
|
rows = list(zip(scores, labels))
|
|
289
305
|
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
|
|
290
306
|
|
|
291
307
|
max_acc = 0
|
|
292
|
-
best_threshold = -1
|
|
308
|
+
best_threshold = -1.0
|
|
293
309
|
positive_so_far = 0
|
|
294
310
|
remaining_negatives = sum(np.array(labels) == 0)
|
|
295
311
|
|
|
@@ -316,7 +332,7 @@ class AbsTaskPairClassification(AbsTask):
|
|
|
316
332
|
|
|
317
333
|
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
|
|
318
334
|
|
|
319
|
-
best_f1 = best_precision = best_recall = 0
|
|
335
|
+
best_f1 = best_precision = best_recall = 0.0
|
|
320
336
|
threshold = 0
|
|
321
337
|
nextract = 0
|
|
322
338
|
ncorrect = 0
|
mteb/abstasks/regression.py
CHANGED
|
@@ -84,10 +84,10 @@ class AbsTaskRegression(AbsTaskClassification):
|
|
|
84
84
|
n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
|
|
85
85
|
abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
|
|
86
86
|
evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
|
|
87
|
-
|
|
87
|
+
|
|
88
88
|
"""
|
|
89
89
|
|
|
90
|
-
evaluator: type[
|
|
90
|
+
evaluator: type[SklearnEvaluator] = SklearnEvaluator
|
|
91
91
|
evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)
|
|
92
92
|
|
|
93
93
|
train_split: str = "train"
|
|
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
|
|
|
113
113
|
)["train"]
|
|
114
114
|
return train_split_sampled, []
|
|
115
115
|
|
|
116
|
-
def _calculate_scores(
|
|
116
|
+
def _calculate_scores( # type: ignore[override]
|
|
117
117
|
self,
|
|
118
118
|
y_test: np.ndarray | list[int],
|
|
119
119
|
y_pred: np.ndarray,
|
|
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):
|
|
|
183
183
|
|
|
184
184
|
return dataset_dict
|
|
185
185
|
|
|
186
|
-
def _calculate_descriptive_statistics_from_split(
|
|
186
|
+
def _calculate_descriptive_statistics_from_split( # type: ignore[override]
|
|
187
187
|
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
|
|
188
188
|
) -> RegressionDescriptiveStatistics:
|
|
189
189
|
train_text = []
|