mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import TypedDict
|
|
4
4
|
|
|
5
5
|
import torch
|
|
6
6
|
from datasets import Dataset
|
|
7
7
|
from sklearn import metrics
|
|
8
8
|
|
|
9
9
|
from mteb._evaluators import ZeroShotClassificationEvaluator
|
|
10
|
-
from mteb.models import EncoderProtocol
|
|
10
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
11
|
+
from mteb.types import EncodeKwargs
|
|
11
12
|
from mteb.types.statistics import (
|
|
12
13
|
ImageStatistics,
|
|
13
14
|
LabelStatistics,
|
|
@@ -111,15 +112,18 @@ class AbsTaskZeroShotClassification(AbsTask):
|
|
|
111
112
|
|
|
112
113
|
def _evaluate_subset(
|
|
113
114
|
self,
|
|
114
|
-
model:
|
|
115
|
+
model: MTEBModels,
|
|
115
116
|
data_split: Dataset,
|
|
116
117
|
*,
|
|
117
118
|
hf_split: str,
|
|
118
119
|
hf_subset: str,
|
|
119
|
-
encode_kwargs:
|
|
120
|
+
encode_kwargs: EncodeKwargs,
|
|
120
121
|
prediction_folder: Path | None = None,
|
|
121
122
|
**kwargs,
|
|
122
123
|
) -> ZeroShotClassificationMetrics:
|
|
124
|
+
if not isinstance(model, EncoderProtocol):
|
|
125
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
126
|
+
|
|
123
127
|
candidate_labels = self.get_candidate_labels()
|
|
124
128
|
data_split = data_split.select_columns(
|
|
125
129
|
[self.input_column_name, self.label_column_name]
|
mteb/benchmarks/_create_table.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import math
|
|
2
1
|
import re
|
|
3
2
|
from collections import defaultdict
|
|
3
|
+
from typing import Literal
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -32,26 +32,18 @@ def _split_on_capital(s: str) -> str:
|
|
|
32
32
|
return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
def _format_n_parameters(n_parameters) ->
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
n_zeros = math.log10(n_thousand)
|
|
42
|
-
if n_zeros >= 6:
|
|
43
|
-
return str(n_thousand // (10**6)) + "B"
|
|
44
|
-
if n_zeros >= 3:
|
|
45
|
-
return str(n_thousand // (10**3)) + "M"
|
|
46
|
-
return str(n_thousand) + "K"
|
|
35
|
+
def _format_n_parameters(n_parameters) -> float | None:
|
|
36
|
+
"""Format n_parameters to be in billions with decimals down to 1 million. I.e. 7M -> 0.007B, 1.5B -> 1.5B, None -> None"""
|
|
37
|
+
if n_parameters:
|
|
38
|
+
n_parameters = float(n_parameters)
|
|
39
|
+
return round(n_parameters / 1e9, 3)
|
|
40
|
+
return None
|
|
47
41
|
|
|
48
42
|
|
|
49
|
-
def _format_max_tokens(max_tokens: float | None) ->
|
|
50
|
-
if max_tokens is None:
|
|
51
|
-
return
|
|
52
|
-
|
|
53
|
-
return "Infinite"
|
|
54
|
-
return str(int(max_tokens))
|
|
43
|
+
def _format_max_tokens(max_tokens: float | None) -> float | None:
|
|
44
|
+
if max_tokens is None or max_tokens == np.inf:
|
|
45
|
+
return None
|
|
46
|
+
return float(max_tokens)
|
|
55
47
|
|
|
56
48
|
|
|
57
49
|
def _get_means_per_types(per_task: pd.DataFrame):
|
|
@@ -144,18 +136,18 @@ def _create_summary_table_from_benchmark_results(
|
|
|
144
136
|
joint_table.insert(
|
|
145
137
|
1,
|
|
146
138
|
"Embedding Dimensions",
|
|
147
|
-
model_metas.map(lambda m:
|
|
139
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
148
140
|
)
|
|
149
141
|
joint_table.insert(
|
|
150
142
|
1,
|
|
151
|
-
"Number of Parameters",
|
|
143
|
+
"Number of Parameters (B)",
|
|
152
144
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
153
145
|
)
|
|
154
146
|
joint_table.insert(
|
|
155
147
|
1,
|
|
156
148
|
"Memory Usage (MB)",
|
|
157
149
|
model_metas.map(
|
|
158
|
-
lambda m:
|
|
150
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
159
151
|
),
|
|
160
152
|
)
|
|
161
153
|
|
|
@@ -250,6 +242,65 @@ def _create_per_task_table_from_benchmark_results(
|
|
|
250
242
|
return per_task
|
|
251
243
|
|
|
252
244
|
|
|
245
|
+
def _create_per_language_table_from_benchmark_results(
|
|
246
|
+
benchmark_results: BenchmarkResults,
|
|
247
|
+
language_view: list[str] | Literal["all"],
|
|
248
|
+
) -> pd.DataFrame:
|
|
249
|
+
"""Create per-language table from BenchmarkResults.
|
|
250
|
+
|
|
251
|
+
Returns a DataFrame with one row per model and one column per language.
|
|
252
|
+
|
|
253
|
+
Args:
|
|
254
|
+
benchmark_results: BenchmarkResults object containing model results
|
|
255
|
+
language_view: List of languages to include in the per-language table, or "all" for all languages present in the results
|
|
256
|
+
Returns:
|
|
257
|
+
DataFrame with per-language scores, ready for styling in the leaderboard
|
|
258
|
+
"""
|
|
259
|
+
if language_view != "all" and not isinstance(language_view, list):
|
|
260
|
+
raise ValueError("language_view must be a list of languages or 'all'")
|
|
261
|
+
|
|
262
|
+
data = benchmark_results.to_dataframe(aggregation_level="language", format="long")
|
|
263
|
+
|
|
264
|
+
if data.empty:
|
|
265
|
+
no_results_frame = pd.DataFrame(
|
|
266
|
+
{"No results": ["You can try relaxing your criteria"]}
|
|
267
|
+
)
|
|
268
|
+
return no_results_frame
|
|
269
|
+
|
|
270
|
+
if language_view != "all":
|
|
271
|
+
data = data[data["language"].isin(language_view)]
|
|
272
|
+
|
|
273
|
+
per_language = data.pivot_table(
|
|
274
|
+
index="model_name", columns="language", values="score", aggfunc="mean"
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
to_remove = per_language.isna().all(axis="columns")
|
|
278
|
+
if to_remove.all():
|
|
279
|
+
no_results_frame = pd.DataFrame(
|
|
280
|
+
{"No results": ["You can try relaxing your criteria"]}
|
|
281
|
+
)
|
|
282
|
+
return no_results_frame
|
|
283
|
+
|
|
284
|
+
models_to_remove = list(per_language[to_remove].index)
|
|
285
|
+
per_language = per_language.drop(models_to_remove, axis=0)
|
|
286
|
+
|
|
287
|
+
per_language["borda_rank"] = _get_borda_rank(per_language)
|
|
288
|
+
per_language = per_language.sort_values("borda_rank", ascending=True)
|
|
289
|
+
per_language = per_language.drop(columns=["borda_rank"])
|
|
290
|
+
per_language = per_language.reset_index()
|
|
291
|
+
|
|
292
|
+
per_language["model_name"] = per_language["model_name"].map(
|
|
293
|
+
lambda name: name.split("/")[-1]
|
|
294
|
+
)
|
|
295
|
+
per_language = per_language.rename(
|
|
296
|
+
columns={
|
|
297
|
+
"model_name": "Model",
|
|
298
|
+
}
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
return per_language
|
|
302
|
+
|
|
303
|
+
|
|
253
304
|
def _create_summary_table_mean_public_private(
|
|
254
305
|
benchmark_results: BenchmarkResults,
|
|
255
306
|
) -> pd.DataFrame:
|
|
@@ -323,18 +374,18 @@ def _create_summary_table_mean_public_private(
|
|
|
323
374
|
joint_table.insert(
|
|
324
375
|
1,
|
|
325
376
|
"Embedding Dimensions",
|
|
326
|
-
model_metas.map(lambda m:
|
|
377
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
327
378
|
)
|
|
328
379
|
joint_table.insert(
|
|
329
380
|
1,
|
|
330
|
-
"Number of Parameters",
|
|
381
|
+
"Number of Parameters (B)",
|
|
331
382
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
332
383
|
)
|
|
333
384
|
joint_table.insert(
|
|
334
385
|
1,
|
|
335
386
|
"Memory Usage (MB)",
|
|
336
387
|
model_metas.map(
|
|
337
|
-
lambda m:
|
|
388
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
338
389
|
),
|
|
339
390
|
)
|
|
340
391
|
|
|
@@ -358,9 +409,7 @@ def _create_summary_table_mean_public_private(
|
|
|
358
409
|
"mean(public)": "Mean (Public)",
|
|
359
410
|
"mean(private)": "Mean (Private)",
|
|
360
411
|
}
|
|
361
|
-
|
|
362
|
-
if "Retrieval" in joint_table.columns:
|
|
363
|
-
rename_dict["Retrieval"] = "Mean (Task)"
|
|
412
|
+
|
|
364
413
|
joint_table = joint_table.rename(columns=rename_dict)
|
|
365
414
|
|
|
366
415
|
# Move borda rank to front
|
|
@@ -447,18 +496,18 @@ def _create_summary_table_mean_subset(
|
|
|
447
496
|
joint_table.insert(
|
|
448
497
|
1,
|
|
449
498
|
"Embedding Dimensions",
|
|
450
|
-
model_metas.map(lambda m:
|
|
499
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
451
500
|
)
|
|
452
501
|
joint_table.insert(
|
|
453
502
|
1,
|
|
454
|
-
"Number of Parameters",
|
|
503
|
+
"Number of Parameters (B)",
|
|
455
504
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
456
505
|
)
|
|
457
506
|
joint_table.insert(
|
|
458
507
|
1,
|
|
459
508
|
"Memory Usage (MB)",
|
|
460
509
|
model_metas.map(
|
|
461
|
-
lambda m:
|
|
510
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
462
511
|
),
|
|
463
512
|
)
|
|
464
513
|
|
|
@@ -560,25 +609,23 @@ def _create_summary_table_mean_task_type(
|
|
|
560
609
|
|
|
561
610
|
# Insert model metadata columns
|
|
562
611
|
joint_table.insert(
|
|
563
|
-
1,
|
|
564
|
-
"Max Tokens",
|
|
565
|
-
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
|
|
612
|
+
1, "Max Tokens", model_metas.map(lambda m: _format_max_tokens(m.max_tokens))
|
|
566
613
|
)
|
|
567
614
|
joint_table.insert(
|
|
568
615
|
1,
|
|
569
616
|
"Embedding Dimensions",
|
|
570
|
-
model_metas.map(lambda m:
|
|
617
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
571
618
|
)
|
|
572
619
|
joint_table.insert(
|
|
573
620
|
1,
|
|
574
|
-
"Number of Parameters",
|
|
621
|
+
"Number of Parameters (B)",
|
|
575
622
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
576
623
|
)
|
|
577
624
|
joint_table.insert(
|
|
578
625
|
1,
|
|
579
626
|
"Memory Usage (MB)",
|
|
580
627
|
model_metas.map(
|
|
581
|
-
lambda m:
|
|
628
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
582
629
|
),
|
|
583
630
|
)
|
|
584
631
|
|
mteb/benchmarks/benchmark.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
|
1
|
-
from
|
|
2
|
-
|
|
3
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator, Sequence
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
6
|
|
|
5
7
|
import pandas as pd
|
|
6
8
|
|
|
7
|
-
from mteb.
|
|
8
|
-
_create_per_task_table_from_benchmark_results,
|
|
9
|
-
_create_summary_table_from_benchmark_results,
|
|
10
|
-
_create_summary_table_mean_public_private,
|
|
11
|
-
_create_summary_table_mean_subset,
|
|
12
|
-
_create_summary_table_mean_task_type,
|
|
13
|
-
)
|
|
14
|
-
from mteb.results import BenchmarkResults
|
|
9
|
+
from mteb.abstasks.abstask import AbsTask
|
|
15
10
|
from mteb.types import StrURL
|
|
16
11
|
|
|
17
12
|
if TYPE_CHECKING:
|
|
18
|
-
from mteb.
|
|
13
|
+
from mteb.results import BenchmarkResults
|
|
19
14
|
|
|
20
15
|
|
|
21
16
|
@dataclass
|
|
@@ -24,6 +19,7 @@ class Benchmark:
|
|
|
24
19
|
|
|
25
20
|
Args:
|
|
26
21
|
name: The name of the benchmark
|
|
22
|
+
aliases: Alternative names for the benchmark
|
|
27
23
|
tasks: The tasks within the benchmark.
|
|
28
24
|
description: A description of the benchmark, should include its intended goal and potentially a description of its construction
|
|
29
25
|
reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
|
|
@@ -42,7 +38,8 @@ class Benchmark:
|
|
|
42
38
|
"""
|
|
43
39
|
|
|
44
40
|
name: str
|
|
45
|
-
tasks: Sequence[
|
|
41
|
+
tasks: Sequence[AbsTask]
|
|
42
|
+
aliases: Sequence[str] = field(default_factory=tuple)
|
|
46
43
|
description: str | None = None
|
|
47
44
|
reference: StrURL | None = None
|
|
48
45
|
citation: str | None = None
|
|
@@ -50,14 +47,15 @@ class Benchmark:
|
|
|
50
47
|
display_on_leaderboard: bool = True
|
|
51
48
|
icon: str | None = None
|
|
52
49
|
display_name: str | None = None
|
|
50
|
+
language_view: list[str] | Literal["all"] = field(default_factory=list)
|
|
53
51
|
|
|
54
|
-
def __iter__(self) ->
|
|
52
|
+
def __iter__(self) -> Iterator[AbsTask]:
|
|
55
53
|
return iter(self.tasks)
|
|
56
54
|
|
|
57
55
|
def __len__(self) -> int:
|
|
58
56
|
return len(self.tasks)
|
|
59
57
|
|
|
60
|
-
def __getitem__(self, index: int) ->
|
|
58
|
+
def __getitem__(self, index: int) -> AbsTask:
|
|
61
59
|
return self.tasks[index]
|
|
62
60
|
|
|
63
61
|
def _create_summary_table(
|
|
@@ -68,6 +66,10 @@ class Benchmark:
|
|
|
68
66
|
Returns:
|
|
69
67
|
A pandas DataFrame representing the summary results.
|
|
70
68
|
"""
|
|
69
|
+
from mteb.benchmarks._create_table import (
|
|
70
|
+
_create_summary_table_from_benchmark_results,
|
|
71
|
+
)
|
|
72
|
+
|
|
71
73
|
return _create_summary_table_from_benchmark_results(benchmark_results)
|
|
72
74
|
|
|
73
75
|
def _create_per_task_table(
|
|
@@ -78,8 +80,38 @@ class Benchmark:
|
|
|
78
80
|
Returns:
|
|
79
81
|
A pandas DataFrame representing the per-task results.
|
|
80
82
|
"""
|
|
83
|
+
from mteb.benchmarks._create_table import (
|
|
84
|
+
_create_per_task_table_from_benchmark_results,
|
|
85
|
+
)
|
|
86
|
+
|
|
81
87
|
return _create_per_task_table_from_benchmark_results(benchmark_results)
|
|
82
88
|
|
|
89
|
+
def _create_per_language_table(
|
|
90
|
+
self, benchmark_results: BenchmarkResults
|
|
91
|
+
) -> pd.DataFrame:
|
|
92
|
+
"""Create per-language table. Called by the leaderboard app.
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
A pandas DataFrame representing the per-language results.
|
|
96
|
+
"""
|
|
97
|
+
from mteb.benchmarks._create_table import (
|
|
98
|
+
_create_per_language_table_from_benchmark_results,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
if self.language_view == "all" or len(self.language_view) > 0:
|
|
102
|
+
return _create_per_language_table_from_benchmark_results(
|
|
103
|
+
benchmark_results, self.language_view
|
|
104
|
+
)
|
|
105
|
+
else:
|
|
106
|
+
no_results_frame = pd.DataFrame(
|
|
107
|
+
{
|
|
108
|
+
"No results": [
|
|
109
|
+
"The per-language table is not available for this benchmark."
|
|
110
|
+
]
|
|
111
|
+
}
|
|
112
|
+
)
|
|
113
|
+
return no_results_frame
|
|
114
|
+
|
|
83
115
|
|
|
84
116
|
class RtebBenchmark(Benchmark):
|
|
85
117
|
"""Wrapper for RTEB benchmark."""
|
|
@@ -87,7 +119,14 @@ class RtebBenchmark(Benchmark):
|
|
|
87
119
|
def _create_summary_table(
|
|
88
120
|
self, benchmark_results: BenchmarkResults
|
|
89
121
|
) -> pd.DataFrame:
|
|
90
|
-
|
|
122
|
+
from mteb.benchmarks._create_table import (
|
|
123
|
+
_create_summary_table_mean_public_private,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
joint_table = _create_summary_table_mean_public_private(benchmark_results)
|
|
127
|
+
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
|
|
128
|
+
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
|
|
129
|
+
return joint_table
|
|
91
130
|
|
|
92
131
|
|
|
93
132
|
class HUMEBenchmark(Benchmark):
|
|
@@ -96,6 +135,8 @@ class HUMEBenchmark(Benchmark):
|
|
|
96
135
|
def _create_summary_table(
|
|
97
136
|
self, benchmark_results: BenchmarkResults
|
|
98
137
|
) -> pd.DataFrame:
|
|
138
|
+
from mteb.benchmarks._create_table import _create_summary_table_mean_subset
|
|
139
|
+
|
|
99
140
|
return _create_summary_table_mean_subset(benchmark_results)
|
|
100
141
|
|
|
101
142
|
|
|
@@ -105,4 +146,24 @@ class MIEBBenchmark(Benchmark):
|
|
|
105
146
|
def _create_summary_table(
|
|
106
147
|
self, benchmark_results: BenchmarkResults
|
|
107
148
|
) -> pd.DataFrame:
|
|
149
|
+
from mteb.benchmarks._create_table import _create_summary_table_mean_task_type
|
|
150
|
+
|
|
108
151
|
return _create_summary_table_mean_task_type(benchmark_results)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class VidoreBenchmark(Benchmark):
|
|
155
|
+
"""Wrapper for Vidore3 benchmark."""
|
|
156
|
+
|
|
157
|
+
def _create_summary_table(
|
|
158
|
+
self, benchmark_results: BenchmarkResults
|
|
159
|
+
) -> pd.DataFrame:
|
|
160
|
+
from mteb.benchmarks._create_table import (
|
|
161
|
+
_create_summary_table_mean_public_private,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
joint_table = _create_summary_table_mean_public_private(benchmark_results)
|
|
165
|
+
# For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
|
|
166
|
+
joint_table = joint_table.rename(
|
|
167
|
+
columns={"Document Understanding": "Mean (Task)"}
|
|
168
|
+
)
|
|
169
|
+
return joint_table
|
|
@@ -6,12 +6,16 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
6
6
|
BUILT_MTEB,
|
|
7
7
|
C_MTEB,
|
|
8
8
|
CHEMTEB,
|
|
9
|
+
CHEMTEB_V1_1,
|
|
9
10
|
CODE_RAG,
|
|
10
11
|
ENCODECHKA,
|
|
11
12
|
FA_MTEB,
|
|
12
13
|
FA_MTEB_2,
|
|
13
14
|
HUME,
|
|
14
15
|
JINA_VDR,
|
|
16
|
+
JMTEB_LITE_V1,
|
|
17
|
+
JMTEB_V2,
|
|
18
|
+
KOVIDORE_V2,
|
|
15
19
|
LONG_EMBED,
|
|
16
20
|
MIEB_ENG,
|
|
17
21
|
MIEB_IMG,
|
|
@@ -38,10 +42,12 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
38
42
|
SEB,
|
|
39
43
|
VIDORE,
|
|
40
44
|
VIDORE_V2,
|
|
45
|
+
VIDORE_V3,
|
|
41
46
|
VISUAL_DOCUMENT_RETRIEVAL,
|
|
42
47
|
VN_MTEB,
|
|
43
48
|
CoIR,
|
|
44
49
|
MTEB_code,
|
|
50
|
+
MTEB_MAIN_RU_v1_1,
|
|
45
51
|
MTEB_multilingual_v1,
|
|
46
52
|
MTEB_multilingual_v2,
|
|
47
53
|
RAR_b,
|
|
@@ -65,6 +71,7 @@ __all__ = [
|
|
|
65
71
|
"BRIGHT_LONG",
|
|
66
72
|
"BUILT_MTEB",
|
|
67
73
|
"CHEMTEB",
|
|
74
|
+
"CHEMTEB_V1_1",
|
|
68
75
|
"CODE_RAG",
|
|
69
76
|
"C_MTEB",
|
|
70
77
|
"ENCODECHKA",
|
|
@@ -73,6 +80,9 @@ __all__ = [
|
|
|
73
80
|
"HUME",
|
|
74
81
|
"HUME",
|
|
75
82
|
"JINA_VDR",
|
|
83
|
+
"JMTEB_LITE_V1",
|
|
84
|
+
"JMTEB_V2",
|
|
85
|
+
"KOVIDORE_V2",
|
|
76
86
|
"LONG_EMBED",
|
|
77
87
|
"MIEB_ENG",
|
|
78
88
|
"MIEB_IMG",
|
|
@@ -108,9 +118,11 @@ __all__ = [
|
|
|
108
118
|
"SEB",
|
|
109
119
|
"VIDORE",
|
|
110
120
|
"VIDORE_V2",
|
|
121
|
+
"VIDORE_V3",
|
|
111
122
|
"VISUAL_DOCUMENT_RETRIEVAL",
|
|
112
123
|
"VN_MTEB",
|
|
113
124
|
"CoIR",
|
|
125
|
+
"MTEB_MAIN_RU_v1_1",
|
|
114
126
|
"MTEB_code",
|
|
115
127
|
"MTEB_multilingual_v1",
|
|
116
128
|
"MTEB_multilingual_v2",
|