mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/results/model_result.py
CHANGED
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import warnings
-from collections.abc import Callable, Iterable
-from typing import Any, Literal
+from collections.abc import Callable, Iterable
+from typing import Any, Literal, cast
 
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import <line truncated in extracted source>
+from typing_extensions import overload
 
 from mteb.abstasks.abstask import AbsTask
 from mteb.abstasks.task_metadata import (
@@ -22,7 +24,7 @@ from mteb.types import (
     SplitName,
 )
 
-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult
 
 logger = logging.getLogger(__name__)
 
@@ -30,7 +32,7 @@ logger = logging.getLogger(__name__)
 def _aggregate_and_pivot(
     df: pd.DataFrame,
     columns: list[str],
-    aggregation_level: Literal["subset", "split", "task"],
+    aggregation_level: Literal["subset", "split", "task", "language"],
     format: Literal["wide", "long"],
     aggregation_fn: Callable[[list[Score]], Any] | None,
 ) -> pd.DataFrame:
@@ -43,6 +45,12 @@ def _aggregate_and_pivot(
     elif aggregation_level == "task":
         index_columns = ["task_name"]
 
+    elif aggregation_level == "language":
+        index_columns = ["language"]
+        df = df.explode("language").reset_index(
+            drop=True
+        )  # each language in its own row before aggregation
+
     # perform aggregation
     if aggregation_fn is None:
         aggregation_fn = np.mean
@@ -52,7 +60,7 @@
         index=index_columns,
         columns=columns,

[remainder of diff truncated in extracted source]
columns=columns,
|
|
54
62
|
values="score",
|
|
55
|
-
aggfunc=aggregation_fn,
|
|
63
|
+
aggfunc=aggregation_fn, # type: ignore[arg-type]
|
|
56
64
|
).reset_index()
|
|
57
65
|
elif format == "long":
|
|
58
66
|
return (
|
|
@@ -75,29 +83,31 @@ class ModelResult(BaseModel):
|
|
|
75
83
|
model_revision: str | None
|
|
76
84
|
task_results: list[TaskResult]
|
|
77
85
|
default_modalities: list[Modalities] = Field(
|
|
78
|
-
default_factory=lambda: ["text"], alias="modalities"
|
|
86
|
+
default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
|
|
79
87
|
)
|
|
80
88
|
model_config = (
|
|
81
89
|
ConfigDict( # to free up the name model_* which is otherwise protected
|
|
82
90
|
protected_namespaces=(),
|
|
83
91
|
)
|
|
84
92
|
)
|
|
93
|
+
exceptions: list[TaskError] | None = None
|
|
85
94
|
|
|
86
95
|
def __repr__(self) -> str:
|
|
87
96
|
n_entries = len(self.task_results)
|
|
88
97
|
return f"ModelResult(model_name={self.model_name}, model_revision={self.model_revision}, task_results=[...](#{n_entries}))"
|
|
89
98
|
|
|
90
99
|
@classmethod
|
|
91
|
-
def from_validated(cls, **data: dict[str, Any]) ->
|
|
100
|
+
def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
|
|
92
101
|
"""Create a ModelResult from validated data.
|
|
93
102
|
|
|
94
103
|
Args:
|
|
95
104
|
data: The validated data.
|
|
96
105
|
"""
|
|
97
|
-
data["task_results"] = [
|
|
98
|
-
TaskResult.from_validated(**res)
|
|
106
|
+
data["task_results"] = [ # type: ignore[assignment]
|
|
107
|
+
TaskResult.from_validated(**res) # type: ignore[arg-type]
|
|
108
|
+
for res in data["task_results"]
|
|
99
109
|
]
|
|
100
|
-
return cls.model_construct(**data)
|
|
110
|
+
return cls.model_construct(**data) # type: ignore[arg-type]
|
|
101
111
|
|
|
102
112
|
def _filter_tasks(
|
|
103
113
|
self,
|
|
@@ -107,7 +117,7 @@ class ModelResult(BaseModel):
|
|
|
107
117
|
task_types: list[TaskType] | None = None,
|
|
108
118
|
modalities: list[Modalities] | None = None,
|
|
109
119
|
is_public: bool | None = None,
|
|
110
|
-
) ->
|
|
120
|
+
) -> ModelResult:
|
|
111
121
|
new_task_results = []
|
|
112
122
|
for task_result in self.task_results:
|
|
113
123
|
if (task_names is not None) and (task_result.task_name not in task_names):
|
|
@@ -135,7 +145,7 @@ class ModelResult(BaseModel):
|
|
|
135
145
|
task_results=new_task_results,
|
|
136
146
|
)
|
|
137
147
|
|
|
138
|
-
def select_tasks(self, tasks:
|
|
148
|
+
def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
|
|
139
149
|
"""Select tasks from the ModelResult based on a list of AbsTask objects.
|
|
140
150
|
|
|
141
151
|
Args:
|
|
@@ -153,6 +163,28 @@ class ModelResult(BaseModel):
|
|
|
153
163
|
task_results=new_task_results,
|
|
154
164
|
)
|
|
155
165
|
|
|
166
|
+
@overload
|
|
167
|
+
def _get_scores(
|
|
168
|
+
self,
|
|
169
|
+
splits: list[SplitName] | None = None,
|
|
170
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
171
|
+
scripts: list[ISOLanguageScript] | None = None,
|
|
172
|
+
getter: Callable[[ScoresDict], Score] | None = None,
|
|
173
|
+
aggregation: Callable[[list[Score]], Any] | None = None,
|
|
174
|
+
format: Literal["wide"] = "wide",
|
|
175
|
+
) -> dict: ...
|
|
176
|
+
|
|
177
|
+
@overload
|
|
178
|
+
def _get_scores(
|
|
179
|
+
self,
|
|
180
|
+
splits: list[SplitName] | None = None,
|
|
181
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
182
|
+
scripts: list[ISOLanguageScript] | None = None,
|
|
183
|
+
getter: Callable[[ScoresDict], Score] | None = None,
|
|
184
|
+
aggregation: Callable[[list[Score]], Any] | None = None,
|
|
185
|
+
format: Literal["long"] = "long",
|
|
186
|
+
) -> list: ...
|
|
187
|
+
|
|
156
188
|
def _get_scores(
|
|
157
189
|
self,
|
|
158
190
|
splits: list[SplitName] | None = None,
|
|
@@ -170,21 +202,24 @@ class ModelResult(BaseModel):
|
|
|
170
202
|
aggregation = aggregation if aggregation is not None else np.mean
|
|
171
203
|
else:
|
|
172
204
|
use_fast = True
|
|
205
|
+
aggregation = cast(Callable[[list[Score]], Any], aggregation)
|
|
206
|
+
getter = cast(Callable[[ScoresDict], Score], getter)
|
|
207
|
+
|
|
173
208
|
if format == "wide":
|
|
174
209
|
scores = {}
|
|
175
210
|
for res in self.task_results:
|
|
176
211
|
try:
|
|
177
212
|
if use_fast:
|
|
178
213
|
scores[res.task_name] = res._get_score_fast(
|
|
179
|
-
splits=splits,
|
|
180
|
-
languages=languages,
|
|
214
|
+
splits=splits,
|
|
215
|
+
languages=languages,
|
|
181
216
|
)
|
|
182
217
|
else:
|
|
183
218
|
scores[res.task_name] = res.get_score(
|
|
184
219
|
splits=splits,
|
|
185
220
|
languages=languages,
|
|
186
|
-
aggregation=aggregation,
|
|
187
|
-
getter=getter,
|
|
221
|
+
aggregation=aggregation,
|
|
222
|
+
getter=getter,
|
|
188
223
|
scripts=scripts,
|
|
189
224
|
)
|
|
190
225
|
except Exception as e:
|
|
@@ -199,14 +234,14 @@ class ModelResult(BaseModel):
|
|
|
199
234
|
if use_fast:
|
|
200
235
|
score = task_res._get_score_fast(
|
|
201
236
|
splits=splits,
|
|
202
|
-
languages=languages,
|
|
237
|
+
languages=languages,
|
|
203
238
|
)
|
|
204
239
|
else:
|
|
205
240
|
score = task_res.get_score(
|
|
206
241
|
splits=splits,
|
|
207
242
|
languages=languages,
|
|
208
|
-
aggregation=aggregation,
|
|
209
|
-
getter=getter,
|
|
243
|
+
aggregation=aggregation,
|
|
244
|
+
getter=getter,
|
|
210
245
|
scripts=scripts,
|
|
211
246
|
)
|
|
212
247
|
entry = dict(
|
|
@@ -226,7 +261,7 @@ class ModelResult(BaseModel):
|
|
|
226
261
|
)
|
|
227
262
|
return entries
|
|
228
263
|
|
|
229
|
-
def _get_score_for_table(self) -> list[dict[str, str | float]]:
|
|
264
|
+
def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
|
|
230
265
|
scores_data = []
|
|
231
266
|
model_name = self.model_name
|
|
232
267
|
for task_result in self.task_results:
|
|
@@ -238,10 +273,10 @@ class ModelResult(BaseModel):
|
|
|
238
273
|
"model_revision": self.model_revision,
|
|
239
274
|
"task_name": task_name,
|
|
240
275
|
"split": split,
|
|
276
|
+
"language": score_item.get("languages", ["Unknown"]),
|
|
241
277
|
"subset": score_item.get("hf_subset", "default"),
|
|
242
278
|
"score": score_item.get("main_score", None),
|
|
243
279
|
}
|
|
244
|
-
|
|
245
280
|
scores_data.append(row)
|
|
246
281
|
|
|
247
282
|
return scores_data
|
|
@@ -285,7 +320,9 @@ class ModelResult(BaseModel):
|
|
|
285
320
|
scores_data = self._get_score_for_table()
|
|
286
321
|
|
|
287
322
|
if not scores_data:
|
|
288
|
-
|
|
323
|
+
msg = "No scores data available. Returning empty DataFrame."
|
|
324
|
+
logger.warning(msg)
|
|
325
|
+
warnings.warn(msg)
|
|
289
326
|
return pd.DataFrame()
|
|
290
327
|
|
|
291
328
|
# Create DataFrame
|
|
@@ -308,7 +345,7 @@ class ModelResult(BaseModel):
|
|
|
308
345
|
def __hash__(self) -> int:
|
|
309
346
|
return id(self)
|
|
310
347
|
|
|
311
|
-
def __iter__(self) -> Iterable[TaskResult]:
|
|
348
|
+
def __iter__(self) -> Iterable[TaskResult]: # type: ignore[override]
|
|
312
349
|
return iter(self.task_results)
|
|
313
350
|
|
|
314
351
|
def __getitem__(self, index) -> TaskResult:
|
|
@@ -361,13 +398,13 @@ class ModelResult(BaseModel):
|
|
|
361
398
|
return [task_res.task_name for task_res in self.task_results]
|
|
362
399
|
|
|
363
400
|
@property
|
|
364
|
-
def modalities(self) -> list[
|
|
401
|
+
def modalities(self) -> list[Modalities]:
|
|
365
402
|
"""Get all modalities in the task results.
|
|
366
403
|
|
|
367
404
|
Returns:
|
|
368
405
|
A list of modalities in the task results.
|
|
369
406
|
"""
|
|
370
|
-
mods = []
|
|
407
|
+
mods: list[Modalities] = []
|
|
371
408
|
for task_res in self.task_results:
|
|
372
409
|
task_modalities = getattr(task_res, "modalities", [])
|
|
373
410
|
mods.extend(task_modalities)
|
mteb/results/task_result.py
CHANGED
|
@@ -2,9 +2,9 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
-
|
|
5
|
+
import warnings
|
|
6
6
|
from collections import defaultdict
|
|
7
|
-
from collections.abc import Callable, Iterable
|
|
7
|
+
from collections.abc import Callable, Iterable, Mapping
|
|
8
8
|
from functools import cached_property
|
|
9
9
|
from importlib.metadata import version
|
|
10
10
|
from pathlib import Path
|
|
@@ -16,8 +16,11 @@ from packaging.version import Version
|
|
|
16
16
|
from pydantic import BaseModel, field_validator
|
|
17
17
|
from typing_extensions import Self
|
|
18
18
|
|
|
19
|
+
from mteb import TaskMetadata
|
|
19
20
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
21
|
+
from mteb.abstasks import AbsTaskClassification
|
|
20
22
|
from mteb.abstasks.abstask import AbsTask
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskDomain
|
|
21
24
|
from mteb.languages import LanguageScripts
|
|
22
25
|
from mteb.models.model_meta import ScoringFunction
|
|
23
26
|
from mteb.types import (
|
|
@@ -39,67 +42,59 @@ class Criteria(HelpfulStrEnum):
|
|
|
39
42
|
DATASET_REVISION = "dataset_revision"
|
|
40
43
|
|
|
41
44
|
|
|
42
|
-
class ScalaNbClassificationDummy:
|
|
45
|
+
class ScalaNbClassificationDummy(AbsTaskClassification):
|
|
43
46
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
44
47
|
|
|
45
|
-
metadata =
|
|
48
|
+
metadata = TaskMetadata(
|
|
46
49
|
name="ScalaNbClassification",
|
|
50
|
+
description="A dummy",
|
|
47
51
|
main_score="accuracy",
|
|
48
52
|
type="Classification",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
},
|
|
52
|
-
dataset={"revision": "revision_not_applicable"},
|
|
53
|
-
revision="revision_not_applicable",
|
|
53
|
+
eval_langs=["nob-Latn"],
|
|
54
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
54
55
|
)
|
|
55
56
|
|
|
56
57
|
|
|
57
|
-
class ScalaNnClassificationDummy:
|
|
58
|
+
class ScalaNnClassificationDummy(AbsTaskClassification):
|
|
58
59
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
59
60
|
|
|
60
|
-
metadata =
|
|
61
|
+
metadata = TaskMetadata(
|
|
61
62
|
name="ScalaNnClassification",
|
|
63
|
+
description="A dummy",
|
|
62
64
|
main_score="accuracy",
|
|
63
65
|
type="Classification",
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
},
|
|
67
|
-
dataset={"revision": "revision_not_applicable"},
|
|
68
|
-
revision="revision_not_applicable",
|
|
66
|
+
eval_langs=["nob-Latn"],
|
|
67
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
69
68
|
)
|
|
70
69
|
|
|
71
70
|
|
|
72
|
-
class ScalaDaClassificationDummy:
|
|
71
|
+
class ScalaDaClassificationDummy(AbsTaskClassification):
|
|
73
72
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
74
73
|
|
|
75
|
-
metadata =
|
|
74
|
+
metadata = TaskMetadata(
|
|
76
75
|
name="ScalaDaClassification",
|
|
76
|
+
description="A dummy",
|
|
77
77
|
main_score="accuracy",
|
|
78
78
|
type="Classification",
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
},
|
|
82
|
-
dataset={"revision": "revision_not_applicable"},
|
|
83
|
-
revision="revision_not_applicable",
|
|
79
|
+
eval_langs=["dan-Latn"],
|
|
80
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
84
81
|
)
|
|
85
82
|
|
|
86
83
|
|
|
87
|
-
class ScalaSvClassificationDummy:
|
|
84
|
+
class ScalaSvClassificationDummy(AbsTaskClassification):
|
|
88
85
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
89
86
|
|
|
90
|
-
metadata =
|
|
87
|
+
metadata = TaskMetadata(
|
|
91
88
|
name="ScalaSvClassification",
|
|
89
|
+
description="A dummy",
|
|
92
90
|
main_score="accuracy",
|
|
93
91
|
type="Classification",
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
},
|
|
97
|
-
dataset={"revision": "revision_not_applicable"},
|
|
98
|
-
revision="revision_not_applicable",
|
|
92
|
+
eval_langs=["swe-Latn"],
|
|
93
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
99
94
|
)
|
|
100
95
|
|
|
101
96
|
|
|
102
|
-
outdated_tasks = {
|
|
97
|
+
outdated_tasks: dict[str, type[AbsTask]] = {
|
|
103
98
|
"ScalaNbClassification": ScalaNbClassificationDummy,
|
|
104
99
|
"ScalaNnClassification": ScalaNnClassificationDummy,
|
|
105
100
|
"ScalaDaClassification": ScalaDaClassificationDummy,
|
|
@@ -166,10 +161,10 @@ class TaskResult(BaseModel):
|
|
|
166
161
|
def from_task_results(
|
|
167
162
|
cls,
|
|
168
163
|
task: AbsTask | type[AbsTask],
|
|
169
|
-
scores: dict[SplitName,
|
|
164
|
+
scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
|
|
170
165
|
evaluation_time: float,
|
|
171
166
|
kg_co2_emissions: float | None = None,
|
|
172
|
-
) ->
|
|
167
|
+
) -> TaskResult:
|
|
173
168
|
"""Create a TaskResult from the task and scores.
|
|
174
169
|
|
|
175
170
|
Args:
|
|
@@ -246,12 +241,12 @@ class TaskResult(BaseModel):
|
|
|
246
241
|
return get_task(self.task_name)
|
|
247
242
|
|
|
248
243
|
@property
|
|
249
|
-
def domains(self) -> list[
|
|
244
|
+
def domains(self) -> list[TaskDomain]:
|
|
250
245
|
"""Get the domains of the task."""
|
|
251
246
|
doms = self.task.metadata.domains
|
|
252
247
|
if doms is None:
|
|
253
248
|
doms = []
|
|
254
|
-
return doms
|
|
249
|
+
return doms
|
|
255
250
|
|
|
256
251
|
@property
|
|
257
252
|
def task_type(self) -> str:
|
|
@@ -307,7 +302,7 @@ class TaskResult(BaseModel):
|
|
|
307
302
|
if isinstance(v, dict):
|
|
308
303
|
self._round_scores(v, n)
|
|
309
304
|
elif isinstance(v, float):
|
|
310
|
-
value[i] = round(v, n)
|
|
305
|
+
value[i] = round(v, n) # type: ignore[call-overload]
|
|
311
306
|
|
|
312
307
|
elif isinstance(value, float):
|
|
313
308
|
scores[key] = round(value, n)
|
|
@@ -325,7 +320,7 @@ class TaskResult(BaseModel):
|
|
|
325
320
|
json.dump(json_obj, f, indent=2)
|
|
326
321
|
|
|
327
322
|
@classmethod
|
|
328
|
-
def from_disk(cls, path: Path, load_historic_data: bool = True) ->
|
|
323
|
+
def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
|
|
329
324
|
"""Load TaskResult from disk.
|
|
330
325
|
|
|
331
326
|
Args:
|
|
@@ -356,7 +351,7 @@ class TaskResult(BaseModel):
|
|
|
356
351
|
) # assume it is before 1.11.0 if the version is not present
|
|
357
352
|
|
|
358
353
|
try:
|
|
359
|
-
obj = cls.model_validate(data)
|
|
354
|
+
obj: TaskResult = cls.model_validate(data)
|
|
360
355
|
except Exception as e:
|
|
361
356
|
if not pre_1_11_load:
|
|
362
357
|
raise e
|
|
@@ -381,6 +376,7 @@ class TaskResult(BaseModel):
|
|
|
381
376
|
from mteb import get_task
|
|
382
377
|
|
|
383
378
|
task_name = obj.task_name
|
|
379
|
+
task: AbsTask | type[AbsTask]
|
|
384
380
|
if task_name in outdated_tasks:
|
|
385
381
|
task = outdated_tasks[task_name]
|
|
386
382
|
else:
|
|
@@ -393,11 +389,11 @@ class TaskResult(BaseModel):
|
|
|
393
389
|
for key in list(hf_subset_scores.keys()):
|
|
394
390
|
if isinstance(hf_subset_scores[key], dict):
|
|
395
391
|
for k, v in hf_subset_scores[key].items():
|
|
396
|
-
hf_subset_scores[f"{key}_{k}"] = v
|
|
397
|
-
hf_subset_scores.pop(key)
|
|
392
|
+
hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index]
|
|
393
|
+
hf_subset_scores.pop(key) # type: ignore[attr-defined]
|
|
398
394
|
|
|
399
395
|
@classmethod
|
|
400
|
-
def _convert_from_before_v1_11_0(cls, data: dict) ->
|
|
396
|
+
def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
|
|
401
397
|
from mteb.get_tasks import _TASKS_REGISTRY
|
|
402
398
|
|
|
403
399
|
# in case the task name is not found in the registry, try to find a lower case version
|
|
@@ -462,7 +458,9 @@ class TaskResult(BaseModel):
|
|
|
462
458
|
if main_score in hf_subset_scores:
|
|
463
459
|
hf_subset_scores["main_score"] = hf_subset_scores[main_score]
|
|
464
460
|
else:
|
|
465
|
-
|
|
461
|
+
msg = f"Main score {main_score} not found in scores"
|
|
462
|
+
logger.warning(msg)
|
|
463
|
+
warnings.warn(msg)
|
|
466
464
|
hf_subset_scores["main_score"] = None
|
|
467
465
|
|
|
468
466
|
# specific fixes:
|
|
@@ -481,7 +479,7 @@ class TaskResult(BaseModel):
|
|
|
481
479
|
scores["test"]["fra-fra"] = scores["test"].pop("fr")
|
|
482
480
|
|
|
483
481
|
result: TaskResult = TaskResult.from_task_results(
|
|
484
|
-
task,
|
|
482
|
+
task,
|
|
485
483
|
scores,
|
|
486
484
|
evaluation_time,
|
|
487
485
|
kg_co2_emissions=None,
|
|
@@ -532,7 +530,7 @@ class TaskResult(BaseModel):
|
|
|
532
530
|
def _get_score_fast(
|
|
533
531
|
self,
|
|
534
532
|
splits: Iterable[str] | None = None,
|
|
535
|
-
languages:
|
|
533
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
536
534
|
subsets: Iterable[str] | None = None,
|
|
537
535
|
) -> float:
|
|
538
536
|
"""Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
|
|
@@ -581,7 +579,7 @@ class TaskResult(BaseModel):
|
|
|
581
579
|
return val_sum / n_val
|
|
582
580
|
|
|
583
581
|
@classmethod
|
|
584
|
-
def from_validated(cls, **data) ->
|
|
582
|
+
def from_validated(cls, **data) -> TaskResult:
|
|
585
583
|
"""Create a TaskResult from validated data.
|
|
586
584
|
|
|
587
585
|
Returns:
|
|
@@ -592,13 +590,13 @@ class TaskResult(BaseModel):
|
|
|
592
590
|
def __repr__(self) -> str:
|
|
593
591
|
return f"TaskResult(task_name={self.task_name}, scores=...)"
|
|
594
592
|
|
|
595
|
-
def only_main_score(self) ->
|
|
593
|
+
def only_main_score(self) -> TaskResult:
|
|
596
594
|
"""Return a new TaskResult object with only the main score.
|
|
597
595
|
|
|
598
596
|
Returns:
|
|
599
597
|
A new TaskResult object with only the main score.
|
|
600
598
|
"""
|
|
601
|
-
new_scores = {}
|
|
599
|
+
new_scores: dict[str, list[Score]] = {}
|
|
602
600
|
for split in self.scores:
|
|
603
601
|
new_scores[split] = []
|
|
604
602
|
for subset_scores in self.scores[split]:
|
|
@@ -610,10 +608,12 @@ class TaskResult(BaseModel):
|
|
|
610
608
|
}
|
|
611
609
|
)
|
|
612
610
|
new_res = {**self.to_dict(), "scores": new_scores}
|
|
613
|
-
|
|
614
|
-
return new_res
|
|
611
|
+
return TaskResult.from_validated(**new_res)
|
|
615
612
|
|
|
616
|
-
def validate_and_filter_scores(
|
|
613
|
+
def validate_and_filter_scores(
|
|
614
|
+
self,
|
|
615
|
+
task: AbsTask | None = None,
|
|
616
|
+
) -> TaskResult:
|
|
617
617
|
"""Validate and filter the scores against the task metadata.
|
|
618
618
|
|
|
619
619
|
This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
|
|
@@ -633,21 +633,23 @@ class TaskResult(BaseModel):
|
|
|
633
633
|
task = get_task(self.task_name)
|
|
634
634
|
|
|
635
635
|
splits = task.eval_splits
|
|
636
|
-
hf_subsets = task.hf_subsets
|
|
637
|
-
hf_subsets = set(hf_subsets)
|
|
636
|
+
hf_subsets = set(task.hf_subsets) # Convert to set once
|
|
638
637
|
|
|
639
|
-
new_scores = {}
|
|
638
|
+
new_scores: dict[str, list[Score]] = {}
|
|
640
639
|
seen_splits = set()
|
|
641
640
|
for split in self.scores:
|
|
642
641
|
if split not in splits:
|
|
643
642
|
continue
|
|
644
|
-
new_scores[split] = []
|
|
645
643
|
seen_subsets = set()
|
|
646
|
-
for
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
644
|
+
# Use list comprehension for better performance
|
|
645
|
+
new_scores[split] = [
|
|
646
|
+
_scores
|
|
647
|
+
for _scores in self.scores[split]
|
|
648
|
+
if _scores["hf_subset"] in hf_subsets
|
|
649
|
+
]
|
|
650
|
+
for _scores in new_scores[split]:
|
|
650
651
|
seen_subsets.add(_scores["hf_subset"])
|
|
652
|
+
|
|
651
653
|
if seen_subsets != hf_subsets:
|
|
652
654
|
missing_subsets = hf_subsets - seen_subsets
|
|
653
655
|
if len(missing_subsets) > 2:
|
|
@@ -656,17 +658,39 @@ class TaskResult(BaseModel):
|
|
|
656
658
|
else:
|
|
657
659
|
missing_subsets_str = str(missing_subsets)
|
|
658
660
|
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
)
|
|
661
|
+
msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
|
|
662
|
+
logger.warning(msg)
|
|
663
|
+
warnings.warn(msg)
|
|
664
|
+
for missing_subset in missing_subsets:
|
|
665
|
+
new_scores[split].append(
|
|
666
|
+
{
|
|
667
|
+
"hf_subset": missing_subset,
|
|
668
|
+
"main_score": np.nan,
|
|
669
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
670
|
+
missing_subset, []
|
|
671
|
+
),
|
|
672
|
+
}
|
|
673
|
+
)
|
|
662
674
|
seen_splits.add(split)
|
|
663
675
|
if seen_splits != set(splits):
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
)
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
676
|
+
msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
|
|
677
|
+
logger.warning(msg)
|
|
678
|
+
warnings.warn(msg)
|
|
679
|
+
for missing_split in set(splits) - seen_splits:
|
|
680
|
+
new_scores[missing_split] = []
|
|
681
|
+
for missing_subset in hf_subsets:
|
|
682
|
+
new_scores[missing_split].append(
|
|
683
|
+
{
|
|
684
|
+
"hf_subset": missing_subset,
|
|
685
|
+
"main_score": np.nan,
|
|
686
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
687
|
+
missing_subset, []
|
|
688
|
+
),
|
|
689
|
+
}
|
|
690
|
+
)
|
|
691
|
+
data = self.model_dump()
|
|
692
|
+
data["scores"] = new_scores
|
|
693
|
+
return type(self).model_construct(**data)
|
|
670
694
|
|
|
671
695
|
def is_mergeable(
|
|
672
696
|
self,
|
|
@@ -698,27 +722,31 @@ class TaskResult(BaseModel):
|
|
|
698
722
|
name = result.metadata.name
|
|
699
723
|
revision = result.metadata.revision
|
|
700
724
|
else:
|
|
725
|
+
msg = "result must be a TaskResult or AbsTask object"
|
|
726
|
+
if raise_error:
|
|
727
|
+
raise ValueError(msg)
|
|
728
|
+
logger.debug(msg)
|
|
701
729
|
return False
|
|
702
730
|
|
|
703
731
|
if self.task_name != name:
|
|
732
|
+
msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
|
|
704
733
|
if raise_error:
|
|
705
|
-
raise ValueError(
|
|
706
|
-
|
|
707
|
-
)
|
|
734
|
+
raise ValueError(msg)
|
|
735
|
+
logger.debug(msg)
|
|
708
736
|
return False
|
|
709
737
|
|
|
710
738
|
if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
|
|
739
|
+
msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
|
|
711
740
|
if raise_error:
|
|
712
|
-
raise ValueError(
|
|
713
|
-
|
|
714
|
-
)
|
|
741
|
+
raise ValueError(msg)
|
|
742
|
+
logger.debug(msg)
|
|
715
743
|
return False
|
|
716
744
|
|
|
717
745
|
if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
|
|
746
|
+
msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
|
|
718
747
|
if raise_error:
|
|
719
|
-
raise ValueError(
|
|
720
|
-
|
|
721
|
-
)
|
|
748
|
+
raise ValueError(msg)
|
|
749
|
+
logger.debug(msg)
|
|
722
750
|
return False
|
|
723
751
|
|
|
724
752
|
return True
|
|
@@ -730,7 +758,7 @@ class TaskResult(BaseModel):
|
|
|
730
758
|
"mteb_version",
|
|
731
759
|
"dataset_revision",
|
|
732
760
|
],
|
|
733
|
-
) ->
|
|
761
|
+
) -> TaskResult:
|
|
734
762
|
"""Merges two TaskResult objects.
|
|
735
763
|
|
|
736
764
|
Args:
|
|
@@ -836,3 +864,15 @@ class TaskResult(BaseModel):
|
|
|
836
864
|
)
|
|
837
865
|
)
|
|
838
866
|
return results
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
class TaskError(BaseModel):
|
|
870
|
+
"""A class to represent an error that occurred during the evaluation of a task.
|
|
871
|
+
|
|
872
|
+
Attributes:
|
|
873
|
+
task_name: The name of the MTEB task.
|
|
874
|
+
exception: The error message that occurred during the evaluation.
|
|
875
|
+
"""
|
|
876
|
+
|
|
877
|
+
task_name: str
|
|
878
|
+
exception: str
|