mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
mteb/results/benchmark_results.py
CHANGED

@@ -1,3 +1,4 @@
+import functools
 import json
 import logging
 import warnings

@@ -15,6 +16,7 @@ from mteb.abstasks.task_metadata import (
     TaskDomain,
     TaskType,
 )
+from mteb.benchmarks.benchmark import Benchmark
 from mteb.models import ModelMeta
 from mteb.models.get_model_meta import get_model_metas
 from mteb.types import (
@@ -31,6 +33,24 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


+# Global cache for model metas and version parsing
+@functools.lru_cache
+def _get_cached_model_metas() -> dict[str, str | None]:
+    """Cache model metas to avoid repeated calls."""
+    return {meta.name: meta.revision for meta in get_model_metas()}
+
+
+@functools.lru_cache(maxsize=10000)
+def _parse_version_cached(version_str: str | None) -> Version | None:
+    """Cache version parsing to avoid repeated parsing."""
+    if version_str is None:
+        return None
+    try:
+        return Version(version_str)
+    except (InvalidVersion, TypeError):
+        return None
+
+
 class BenchmarkResults(BaseModel):
     """Data class to hold the benchmark results of a model.

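The two helpers added above memoize expensive work at module level: the model-meta lookup is computed once, and each distinct version string is parsed at most once. A minimal, self-contained sketch of the same pattern (standalone code, not the mteb module; it assumes Version and InvalidVersion come from packaging.version, as the except clause suggests):

    import functools

    from packaging.version import InvalidVersion, Version


    @functools.lru_cache(maxsize=10000)
    def parse_version_cached(version_str: str | None) -> Version | None:
        # Parse each distinct string once; subsequent calls are cache hits.
        if version_str is None:
            return None
        try:
            return Version(version_str)
        except (InvalidVersion, TypeError):
            return None


    assert parse_version_cached("2.5.2") == Version("2.5.2")
    assert parse_version_cached("not-a-version") is None
    print(parse_version_cached.cache_info())  # hits/misses counters
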
@@ -39,10 +59,10 @@ class BenchmarkResults(BaseModel):
     """

     model_results: list[ModelResult]
-
-
-
-
+    benchmark: Benchmark | None = None
+    model_config = ConfigDict(
+        protected_namespaces=(),  # to free up the name model_results which is otherwise protected
+        arbitrary_types_allowed=True,  # Benchmark is dataclasses.dataclass
     )

     def __repr__(self) -> str:
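A note on the ConfigDict above: in pydantic v2, field names starting with "model_" clash with pydantic's protected "model_" namespace, and arbitrary_types_allowed lets the plain-dataclass Benchmark be stored as a field without a dedicated validator. A minimal sketch of the namespace part (illustrative model, not mteb's):

    from pydantic import BaseModel, ConfigDict


    class Results(BaseModel):
        # Without protected_namespaces=(), pydantic v2 warns that
        # "model_results" shadows the protected "model_" namespace.
        model_config = ConfigDict(protected_namespaces=())
        model_results: list[str] = []


    print(Results(model_results=["Banking77Classification"]))
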
@@ -173,40 +193,6 @@ class BenchmarkResults(BaseModel):
         Returns:
             A new BenchmarkResults object with the revisions joined.
         """
-
-        def parse_version(version_str: str) -> Version | None:
-            try:
-                return Version(version_str)
-            except (InvalidVersion, TypeError):
-                return None
-
-        def keep_best(group: pd.DataFrame) -> pd.DataFrame:
-            # Filtering out task_results where no scores are present
-            group = group[group["has_scores"]]
-            is_main_revision = group["revision"] == group["main_revision"]
-            # If the main revision is present we select that
-            if is_main_revision.sum() > 0:
-                return group[is_main_revision].head(n=1)
-            unique_revisions = group["revision"].unique()
-
-            # ensure None/NA/"external" revisions is filtered out
-            group.loc[group["revision"].isna(), "revision"] = "no_revision_available"
-            group.loc[group["revision"] == "external", "revision"] = (
-                "no_revision_available"
-            )
-
-            # Filtering out no_revision_available if other revisions are present
-            if (len(unique_revisions) > 1) and (
-                "no_revision_available" in unique_revisions
-            ):
-                group = group[group["revision"] != "no_revision_available"]
-            # If there are any not-NA mteb versions, we select the latest one
-            if group["mteb_version"].notna().any():
-                group = group.dropna(subset=["mteb_version"])
-                group = group.sort_values("mteb_version", ascending=False)
-                return group.head(n=1)
-            return group.head(n=1)
-
         records = []
         for model_result in self:
             for task_result in model_result.task_results:
@@ -223,17 +209,54 @@ class BenchmarkResults(BaseModel):
         if not records:
             return BenchmarkResults.model_construct(model_results=[])
         task_df = pd.DataFrame.from_records(records)
-
-
-
+
+        # Use cached model metas
+        model_to_main_revision = _get_cached_model_metas()
         task_df["main_revision"] = task_df["model"].map(model_to_main_revision)  # type: ignore
-
-
-
-
-
+
+        # Use cached version parsing
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)  # type: ignore
+
+        # Filter out rows without scores first
+        task_df = task_df[task_df["has_scores"]]
+
+        # Optimize groupby with vectorized operations
+        # Sort by priority: main_revision match, then mteb_version (descending), then revision
+        task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]
+
+        # Handle None/NA/external revisions
+        task_df["revision_clean"] = task_df["revision"].copy()
+        task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
+            "no_revision_available"
         )
+        task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
+            "no_revision_available"
+        )
+
+        # Create a priority column for sorting
+        # Higher priority = better to keep
+        # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
+        task_df["priority"] = 0
+        task_df.loc[task_df["is_main_revision"], "priority"] += 1000
+        task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
+        task_df.loc[
+            task_df["revision_clean"] != "no_revision_available", "priority"
+        ] += 10
+
+        # Sort by priority (desc), mteb_version (desc), and take first per group
+        task_df = task_df.sort_values(
+            ["model", "task_name", "priority", "mteb_version"],
+            ascending=[True, True, False, False],
+            na_position="last",
+        )
+
+        task_df = task_df.groupby(["model", "task_name"], as_index=False).first()
+
+        # Reconstruct model results
         model_results = []
+        # Group by original revision to maintain deterministic behavior
+        # After the first() selection above, each (model, task_name) is unique,
+        # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
                 model_name=model,
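The rework above replaces the per-group keep_best callback with a vectorized selection: each row gets an additive priority, the frame is sorted, and the first row per (model, task_name) wins. A self-contained sketch of that selection on toy data:

    import pandas as pd

    runs = pd.DataFrame(
        {
            "model": ["m", "m", "m"],
            "task_name": ["T", "T", "T"],
            "priority": [10, 1010, 110],  # 1010 = main revision with a valid version
            "mteb_version": ["2.1.4", "2.5.2", None],
        }
    )

    best = (
        runs.sort_values(
            ["model", "task_name", "priority", "mteb_version"],
            ascending=[True, True, False, False],
            na_position="last",
        )
        .groupby(["model", "task_name"], as_index=False)
        .first()
    )
    print(best)  # keeps the single priority-1010 row
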
@@ -296,7 +319,7 @@ class BenchmarkResults(BaseModel):

     def to_dataframe(
         self,
-        aggregation_level: Literal["subset", "split", "task"] = "task",
+        aggregation_level: Literal["subset", "split", "task", "language"] = "task",
         aggregation_fn: Callable[[list[Score]], Any] | None = None,
         include_model_revision: bool = False,
         format: Literal["wide", "long"] = "wide",

@@ -321,6 +344,7 @@
                 - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
                 - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
                 - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
+                - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.
             aggregation_fn: The function to use for aggregation. If None, the mean will be used.
             include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
                 If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.
@@ -361,6 +385,23 @@
             format=format,
         )

+    def get_benchmark_result(self) -> pd.DataFrame:
+        """Get aggregated scores for each model in the benchmark.
+
+        Uses the benchmark's summary table creation method to compute scores.
+
+        Returns:
+            A DataFrame with the aggregated benchmark scores for each model.
+        """
+        if self.benchmark is None:
+            raise ValueError(
+                "No benchmark associated with these results (self.benchmark is None). "
+                "To get benchmark results, load results with a Benchmark object. "
+                "`results = cache.load_results(tasks='MTEB(eng, v2)')`"
+            )
+
+        return self.benchmark._create_summary_table(self)
+
     def __iter__(self) -> Iterator[ModelResult]:
         return iter(self.model_results)

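A usage sketch tying the new pieces together; the cache object and its load_results signature are assumed from the error message above, so treat the first line as illustrative rather than confirmed API:

    results = cache.load_results(tasks="MTEB(eng, v2)")  # BenchmarkResults with .benchmark set
    summary = results.get_benchmark_result()             # per-model aggregated scores
    per_lang = results.to_dataframe(aggregation_level="language")
    print(summary.head())
    print(per_lang.head())
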
mteb/results/model_result.py
CHANGED

@@ -22,7 +22,7 @@ from mteb.types import (
     SplitName,
 )

-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult

 logger = logging.getLogger(__name__)

@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 def _aggregate_and_pivot(
     df: pd.DataFrame,
     columns: list[str],
-    aggregation_level: Literal["subset", "split", "task"],
+    aggregation_level: Literal["subset", "split", "task", "language"],
     format: Literal["wide", "long"],
     aggregation_fn: Callable[[list[Score]], Any] | None,
 ) -> pd.DataFrame:

@@ -43,6 +43,12 @@ def _aggregate_and_pivot(
     elif aggregation_level == "task":
         index_columns = ["task_name"]

+    elif aggregation_level == "language":
+        index_columns = ["language"]
+        df = df.explode("language").reset_index(
+            drop=True
+        )  # each language in its own row before aggregation
+
     # perform aggregation
     if aggregation_fn is None:
         aggregation_fn = np.mean
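Each score row carries a list of languages, so the new "language" branch explodes that list into one row per language before aggregating. A minimal pandas sketch of the step (toy rows):

    import pandas as pd

    df = pd.DataFrame(
        {
            "task_name": ["BUCC", "BUCC"],
            "language": [["eng-Latn", "deu-Latn"], ["eng-Latn"]],
            "score": [0.9, 0.8],
        }
    )

    per_language = df.explode("language").reset_index(drop=True)
    print(per_language.groupby("language")["score"].mean())
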
@@ -82,6 +88,7 @@ class ModelResult(BaseModel):
             protected_namespaces=(),
         )
     )
+    exceptions: list[TaskError] | None = None

     def __repr__(self) -> str:
         n_entries = len(self.task_results)

@@ -226,7 +233,7 @@ class ModelResult(BaseModel):
         )
         return entries

-    def _get_score_for_table(self) -> list[dict[str, str | float]]:
+    def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
         scores_data = []
         model_name = self.model_name
         for task_result in self.task_results:

@@ -238,10 +245,10 @@ class ModelResult(BaseModel):
                     "model_revision": self.model_revision,
                     "task_name": task_name,
                     "split": split,
+                    "language": score_item.get("languages", ["Unknown"]),
                     "subset": score_item.get("hf_subset", "default"),
                     "score": score_item.get("main_score", None),
                 }
-
                 scores_data.append(row)

         return scores_data
mteb/results/task_result.py
CHANGED

@@ -633,21 +633,23 @@ class TaskResult(BaseModel):
         task = get_task(self.task_name)

         splits = task.eval_splits
-        hf_subsets = task.hf_subsets
-        hf_subsets = set(hf_subsets)
+        hf_subsets = set(task.hf_subsets)  # Convert to set once

         new_scores = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
                 continue
-            new_scores[split] = []
             seen_subsets = set()
-            for
-
-
-
+            # Use list comprehension for better performance
+            new_scores[split] = [
+                _scores
+                for _scores in self.scores[split]
+                if _scores["hf_subset"] in hf_subsets
+            ]
+            for _scores in new_scores[split]:
                 seen_subsets.add(_scores["hf_subset"])
+
             if seen_subsets != hf_subsets:
                 missing_subsets = hf_subsets - seen_subsets
                 if len(missing_subsets) > 2:

@@ -664,9 +666,9 @@ class TaskResult(BaseModel):
         logger.warning(
             f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
         )
-
-
-        return
+        data = self.model_dump()
+        data["scores"] = new_scores
+        return type(self).model_construct(**data)

     def is_mergeable(
         self,

@@ -698,27 +700,31 @@ class TaskResult(BaseModel):
             name = result.metadata.name
             revision = result.metadata.revision
         else:
+            msg = "result must be a TaskResult or AbsTask object"
+            if raise_error:
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if self.task_name != name:
+            msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
             if raise_error:
-                raise ValueError(
-
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
+            msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
             if raise_error:
-                raise ValueError(
-
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
+            msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
             if raise_error:
-                raise ValueError(
-
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         return True

@@ -836,3 +842,15 @@
             )
         )
         return results
+
+
+class TaskError(BaseModel):
+    """A class to represent an error that occurred during the evaluation of a task.
+
+    Attributes:
+        task_name: The name of the MTEB task.
+        exception: The error message that occurred during the evaluation.
+    """
+
+    task_name: str
+    exception: str
mteb/similarity_functions.py
CHANGED

@@ -1,6 +1,7 @@
 import torch

 from mteb.models import EncoderProtocol
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import Array


@@ -38,6 +39,54 @@ def compute_pairwise_similarity(
     return pairwise_cos_sim(embedding1, embedding2)


+def select_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
+def select_pairwise_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute pairwise similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed pairwise similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return pairwise_cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return pairwise_dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return pairwise_euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
 def _normalize_embeddings(embeddings: Array) -> torch.Tensor:
     """Normalizes the embeddings matrix, so that each sentence embedding has unit length.

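A usage sketch for the two dispatch helpers above (import paths follow this diff; the shape comments assume cos_sim returns a full score matrix while the pairwise variant scores row i against row i):

    import torch

    from mteb.models.model_meta import ScoringFunction
    from mteb.similarity_functions import select_pairwise_similarity, select_similarity

    a = torch.randn(4, 8)
    b = torch.randn(4, 8)

    full = select_similarity(a, b, ScoringFunction.COSINE)            # expected shape (4, 4)
    pairs = select_pairwise_similarity(a, b, ScoringFunction.COSINE)  # expected shape (4,)
    print(full.shape, pairs.shape)
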
|
@@ -16,7 +16,7 @@ from .nusa_translation_bitext_mining import NusaTranslationBitextMining
|
|
|
16
16
|
from .nusa_x_bitext_mining import NusaXBitextMining
|
|
17
17
|
from .phinc_bitext_mining import PhincBitextMining
|
|
18
18
|
from .roma_tales_bitext_mining import RomaTalesBitextMining
|
|
19
|
-
from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining
|
|
19
|
+
from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining, RuSciBenchBitextMiningV2
|
|
20
20
|
from .tatoeba_bitext_mining import TatoebaBitextMining
|
|
21
21
|
from .web_faq_bitext_mining import WebFAQBitextMiningQAs, WebFAQBitextMiningQuestions
|
|
22
22
|
|
|
@@ -40,6 +40,7 @@ __all__ = [
|
|
|
40
40
|
"PhincBitextMining",
|
|
41
41
|
"RomaTalesBitextMining",
|
|
42
42
|
"RuSciBenchBitextMining",
|
|
43
|
+
"RuSciBenchBitextMiningV2",
|
|
43
44
|
"TatoebaBitextMining",
|
|
44
45
|
"WebFAQBitextMiningQAs",
|
|
45
46
|
"WebFAQBitextMiningQuestions",
|
|
mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py
CHANGED

@@ -23,7 +23,7 @@ class BUCCBitextMining(AbsTaskBitextMining):
             "path": "mteb/BUCC",
             "revision": "414572247440f0ccacf7eb0bb70a31533a0e5443",
         },
-        description="BUCC bitext mining dataset",
+        description="BUCC bitext mining dataset train split.",
         reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
         type="BitextMining",
         category="t2t",

@@ -71,7 +71,9 @@ Rapp, Reinhard},

         sentence1 = data["sentence1"][0]
         sentence2 = data["sentence2"][0]
-        sentence1 = [
+        sentence1 = [
+            sentence1[i] for (i, j) in gold
+        ]  # keep only sentences in gold. The 2nd value is meant for sentence2 but not used here. This is fixed in BUCC.v2.
         logger.info(f"Lang {lang} num gold {len(gold)}")
         logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
         logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")

mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py
CHANGED

@@ -20,7 +20,7 @@ class BUCCBitextMiningFast(AbsTaskBitextMining):
             "path": "mteb/bucc-bitext-mining",
             "revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
         },
-        description="BUCC bitext mining dataset",
+        description="BUCC bitext mining dataset train split, gold set only.",
         reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
         type="BitextMining",
         category="t2t",
mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py
CHANGED

@@ -10,11 +10,53 @@ class RuSciBenchBitextMining(AbsTaskBitextMining):
             "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining",
             "revision": "e5840033c5cf2573932db027ac8001fe0a7eb6fa",
         },
-        description="
-
-
-
-
+        description="This task focuses on finding translations of scientific articles. The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. Russian authors often provide English translations for their abstracts and titles, and the data consists of these paired titles and abstracts. The task evaluates a model's ability to match an article's Russian title and abstract to its English counterpart, or vice versa.",
+        reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
+        type="BitextMining",
+        category="t2c",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ru-en": ["rus-Cyrl", "eng-Latn"],
+            "en-ru": ["eng-Latn", "rus-Cyrl"],
+        },
+        main_score="f1",
+        date=("2007-01-01", "2023-01-01"),
+        domains=["Academic", "Non-fiction", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        dialect=[],
+        sample_creation="found",
+        annotations_creators="derived",
+        bibtex_citation=r"""
+@article{vatolin2024ruscibench,
+  author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.},
+  doi = {10.1134/S1064562424602191},
+  issn = {1531-8362},
+  journal = {Doklady Mathematics},
+  month = {12},
+  number = {1},
+  pages = {S251--S260},
+  title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations},
+  url = {https://doi.org/10.1134/S1064562424602191},
+  volume = {110},
+  year = {2024},
+}
+""",
+        prompt="Given the following title and abstract of the scientific article, find its translation",
+        superseded_by="RuSciBenchBitextMining.v2",
+    )
+
+
+class RuSciBenchBitextMiningV2(AbsTaskBitextMining):
+    fast_loading = True
+    metadata = TaskMetadata(
+        name="RuSciBenchBitextMining.v2",
+        dataset={
+            "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining",
+            "revision": "20e815e8ac8787331546386dfd177821510f79a3",
+        },
+        description="This task focuses on finding translations of scientific articles. The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. Russian authors often provide English translations for their abstracts and titles, and the data consists of these paired titles and abstracts. The task evaluates a model's ability to match an article's Russian title and abstract to its English counterpart, or vice versa. Compared to the previous version, 6 erroneous examples have been removed.",
         reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
         type="BitextMining",
         category="t2c",
mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py
CHANGED

@@ -198,9 +198,7 @@ _SPLITS = ["default"]
 class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
     metadata = TaskMetadata(
         name="WebFAQBitextMiningQuestions",
-        description=
-A sentence in the "WebFAQBitextMiningQuestions" task is the question originating from an aligned QA.
-The dataset is sourced from FAQ pages on the web.""",
+        description='The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. A sentence in the "WebFAQBitextMiningQuestions" task is the question originating from an aligned QA. The dataset is sourced from FAQ pages on the web.',
         reference="https://huggingface.co/PaDaS-Lab",
         dataset={
             "path": "PaDaS-Lab/webfaq-bitexts",

@@ -254,9 +252,7 @@ The dataset is sourced from FAQ pages on the web.""",
 class WebFAQBitextMiningQAs(AbsTaskBitextMining):
     metadata = TaskMetadata(
         name="WebFAQBitextMiningQAs",
-        description=
-A sentence in the "WebFAQBitextMiningQAs" task is a concatenation of a question and its corresponding answer.
-The dataset is sourced from FAQ pages on the web.""",
+        description='The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. A sentence in the "WebFAQBitextMiningQAs" task is a concatenation of a question and its corresponding answer. The dataset is sourced from FAQ pages on the web.',
         reference="https://huggingface.co/PaDaS-Lab",
         dataset={
             "path": "PaDaS-Lab/webfaq-bitexts",
mteb/tasks/classification/ara/ajgt.py
CHANGED

@@ -45,8 +45,7 @@ class AJGTV2(AbsTaskClassification):
             "path": "mteb/ajgt",
             "revision": "0a3dea7301ee0c051891f04d32f3e8577a9eae36",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets (900 for training and 900 for testing) annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66/",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ara/hotel_review_sentiment_classification.py
CHANGED

@@ -45,8 +45,7 @@ class HotelReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/HotelReviewSentimentClassification",
             "revision": "f5e6a24acbed4182114ffdf46747090b3f51e836",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ara/online_store_review_sentiment_classification.py
CHANGED

@@ -41,8 +41,7 @@ class OnlineStoreReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/online_store_review_sentiment",
             "revision": "de0e8eed65adf1cbc58f8743a5f5c5df556de4c4",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="This dataset contains Arabic reviews of products from the SHEIN online store. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://huggingface.co/datasets/Ruqiya/Arabic_Reviews_of_SHEIN",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py
CHANGED

@@ -52,8 +52,7 @@ class RestaurantReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/restaurant_review_sentiment",
             "revision": "5d28c1e8fb393173a849696ed178b90a6f78754a",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Dataset of 8156 restaurant reviews from qaym.com in Arabic for sentiment analysis This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ara/tweet_emotion_classification.py
CHANGED

@@ -45,8 +45,7 @@ class TweetEmotionClassificationV2(AbsTaskClassification):
             "path": "mteb/TweetEmotionClassification",
             "revision": "930d65840c089406ceed5241b1a9ba7294e5eeae",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="A dataset of 10,012 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ara/tweet_sarcasm_classification.py
CHANGED

@@ -62,8 +62,7 @@ class TweetSarcasmClassificationV2(AbsTaskClassification):
             "path": "mteb/tweet_sarcasm",
             "revision": "3a20898e2ea3303844e907d55f7a815a7644150d",
         },
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://aclanthology.org/2020.osact-1.5/",
         type="Classification",
         category="t2c",

mteb/tasks/classification/ben/bengali_document_classification.py
CHANGED

@@ -55,8 +55,7 @@ Islam, Tanvir},
 class BengaliDocumentClassificationV2(AbsTaskClassification):
     metadata = TaskMetadata(
         name="BengaliDocumentClassification.v2",
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Dataset for News Classification, categorized with 13 domains. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://aclanthology.org/2023.eacl-main.4",
         dataset={
             "path": "mteb/bengali_document",

mteb/tasks/classification/ben/bengali_hate_speech_classification.py
CHANGED

@@ -45,8 +45,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
 class BengaliHateSpeechClassificationV2(AbsTaskClassification):
     metadata = TaskMetadata(
         name="BengaliHateSpeechClassification.v2",
-        description="
-This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="The Bengali Hate Speech Dataset is a Bengali-language dataset of news articles collected from various Bengali media sources and categorized based on the type of hate in the text. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://huggingface.co/datasets/bn_hate_speech",
         dataset={
             "path": "mteb/bengali_hate_speech",