mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/abstasks/retrieval.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
from collections import defaultdict
|
|
4
|
-
from collections.abc import Callable, Sequence
|
|
4
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from time import time
|
|
7
7
|
from typing import Any, Literal
|
|
@@ -25,6 +25,7 @@ from mteb.models import (
|
|
|
25
25
|
SearchProtocol,
|
|
26
26
|
)
|
|
27
27
|
from mteb.types import (
|
|
28
|
+
EncodeKwargs,
|
|
28
29
|
HFSubset,
|
|
29
30
|
QueryDatasetType,
|
|
30
31
|
RelevantDocumentsType,
|
|
@@ -184,17 +185,17 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
184
185
|
return queries, corpus
|
|
185
186
|
|
|
186
187
|
if self.metadata.is_multilingual:
|
|
187
|
-
for subset in self.queries:
|
|
188
|
-
for split in self.queries[subset]:
|
|
189
|
-
queries = self.queries[subset][split]
|
|
190
|
-
corpus = self.corpus[subset][split]
|
|
188
|
+
for subset in self.queries: # type: ignore[attr-defined]
|
|
189
|
+
for split in self.queries[subset]: # type: ignore[attr-defined]
|
|
190
|
+
queries = self.queries[subset][split] # type: ignore[attr-defined]
|
|
191
|
+
corpus = self.corpus[subset][split] # type: ignore[attr-defined]
|
|
191
192
|
|
|
192
193
|
(
|
|
193
194
|
self.dataset[subset][split]["queries"],
|
|
194
195
|
self.dataset[subset][split]["corpus"],
|
|
195
196
|
) = _process_split(queries, corpus)
|
|
196
197
|
|
|
197
|
-
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
|
|
198
|
+
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
|
|
198
199
|
subset
|
|
199
200
|
][split]
|
|
200
201
|
if hasattr(self, "instructions"):
|
|
@@ -211,15 +212,15 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
211
212
|
][split]
|
|
212
213
|
else:
|
|
213
214
|
subset = "default"
|
|
214
|
-
for split in self.queries:
|
|
215
|
-
queries = self.queries[split]
|
|
216
|
-
corpus = self.corpus[split]
|
|
215
|
+
for split in self.queries: # type: ignore[attr-defined]
|
|
216
|
+
queries = self.queries[split] # type: ignore[attr-defined]
|
|
217
|
+
corpus = self.corpus[split] # type: ignore[attr-defined]
|
|
217
218
|
(
|
|
218
219
|
self.dataset[subset][split]["queries"],
|
|
219
220
|
self.dataset[subset][split]["corpus"],
|
|
220
221
|
) = _process_split(queries, corpus)
|
|
221
222
|
|
|
222
|
-
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
|
|
223
|
+
self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
|
|
223
224
|
split
|
|
224
225
|
].copy()
|
|
225
226
|
if hasattr(self, "instructions"):
|
|
@@ -235,9 +236,9 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
235
236
|
split
|
|
236
237
|
].copy()
|
|
237
238
|
|
|
238
|
-
del self.queries
|
|
239
|
-
del self.corpus
|
|
240
|
-
del self.relevant_docs
|
|
239
|
+
del self.queries # type: ignore[attr-defined]
|
|
240
|
+
del self.corpus # type: ignore[attr-defined]
|
|
241
|
+
del self.relevant_docs # type: ignore[attr-defined]
|
|
241
242
|
if hasattr(self, "instructions"):
|
|
242
243
|
del self.instructions
|
|
243
244
|
if hasattr(self, "top_ranked"):
|
|
@@ -283,10 +284,10 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
283
284
|
split: str = "test",
|
|
284
285
|
subsets_to_run: list[HFSubset] | None = None,
|
|
285
286
|
*,
|
|
286
|
-
encode_kwargs:
|
|
287
|
+
encode_kwargs: EncodeKwargs,
|
|
287
288
|
prediction_folder: Path | None = None,
|
|
288
|
-
**kwargs,
|
|
289
|
-
) ->
|
|
289
|
+
**kwargs: Any,
|
|
290
|
+
) -> Mapping[HFSubset, ScoresDict]:
|
|
290
291
|
"""Evaluate the model on the retrieval task.
|
|
291
292
|
|
|
292
293
|
Args:
|
|
@@ -320,7 +321,7 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
320
321
|
self,
|
|
321
322
|
model: MTEBModels,
|
|
322
323
|
data_split: RetrievalSplitData,
|
|
323
|
-
encode_kwargs:
|
|
324
|
+
encode_kwargs: EncodeKwargs,
|
|
324
325
|
hf_split: str,
|
|
325
326
|
hf_subset: str,
|
|
326
327
|
prediction_folder: Path | None = None,
|
|
@@ -357,6 +358,8 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
357
358
|
**kwargs,
|
|
358
359
|
)
|
|
359
360
|
|
|
361
|
+
search_model: SearchProtocol
|
|
362
|
+
|
|
360
363
|
if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
|
|
361
364
|
search_model = SearchEncoderWrapper(model)
|
|
362
365
|
elif isinstance(model, CrossEncoderProtocol):
|
|
@@ -578,11 +581,12 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
578
581
|
if isinstance(data[split][subset_item], Dataset):
|
|
579
582
|
sections[split] = data[split][subset_item]
|
|
580
583
|
elif converter is not None:
|
|
584
|
+
subset_data = data[split][subset_item]
|
|
585
|
+
if subset_data is None:
|
|
586
|
+
continue
|
|
587
|
+
|
|
581
588
|
sections[split] = Dataset.from_list(
|
|
582
|
-
[
|
|
583
|
-
converter(idx, item)
|
|
584
|
-
for idx, item in data[split][subset_item].items()
|
|
585
|
-
]
|
|
589
|
+
[converter(idx, item) for idx, item in subset_data.items()]
|
|
586
590
|
)
|
|
587
591
|
else:
|
|
588
592
|
raise ValueError(
|
|
@@ -680,7 +684,7 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
680
684
|
|
|
681
685
|
top_k_sorted = defaultdict(list)
|
|
682
686
|
for query_id, values in top_ranked.items():
|
|
683
|
-
sorted_keys = sorted(values, key=values
|
|
687
|
+
sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
|
|
684
688
|
top_k_sorted[query_id] = sorted_keys[: self._top_k]
|
|
685
689
|
|
|
686
690
|
self.dataset[subset][split]["top_ranked"] = top_k_sorted
|
|
@@ -688,10 +692,10 @@ class AbsTaskRetrieval(AbsTask):
|
|
|
688
692
|
|
|
689
693
|
|
|
690
694
|
def _process_relevant_docs(
|
|
691
|
-
collection:
|
|
695
|
+
collection: Mapping[str, Mapping[str, int]],
|
|
692
696
|
hf_subset: str,
|
|
693
697
|
split: str,
|
|
694
|
-
) -> dict[str, dict[str,
|
|
698
|
+
) -> dict[str, dict[str, int]]:
|
|
695
699
|
"""Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this
|
|
696
700
|
|
|
697
701
|
Returns:
|
|
@@ -136,7 +136,7 @@ class RetrievalDatasetLoader:
|
|
|
136
136
|
"_id", "id"
|
|
137
137
|
)
|
|
138
138
|
logger.info("Loaded %d %s Documents.", len(corpus_ds), self.split.upper())
|
|
139
|
-
logger.
|
|
139
|
+
logger.debug("Doc Example: %s", corpus_ds[0])
|
|
140
140
|
return corpus_ds
|
|
141
141
|
|
|
142
142
|
def _load_queries(self) -> QueryDatasetType:
|
|
@@ -152,7 +152,7 @@ class RetrievalDatasetLoader:
|
|
|
152
152
|
)
|
|
153
153
|
|
|
154
154
|
logger.info("Loaded %d %s queries.", len(queries_ds), self.split.upper())
|
|
155
|
-
logger.
|
|
155
|
+
logger.debug("Query Example: %s", queries_ds[0])
|
|
156
156
|
|
|
157
157
|
return queries_ds
|
|
158
158
|
|
mteb/abstasks/sts.py
CHANGED
|
@@ -7,7 +7,8 @@ from scipy.stats import pearsonr, spearmanr
|
|
|
7
7
|
|
|
8
8
|
from mteb._evaluators import AnySTSEvaluator
|
|
9
9
|
from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
|
|
10
|
-
from mteb.models import EncoderProtocol
|
|
10
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
11
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
11
12
|
from mteb.types.statistics import (
|
|
12
13
|
ImageStatistics,
|
|
13
14
|
ScoreStatistics,
|
|
@@ -89,23 +90,30 @@ class AbsTaskSTS(AbsTask):
|
|
|
89
90
|
min_score: Minimum possible score in the dataset.
|
|
90
91
|
max_score: Maximum possible score in the dataset.
|
|
91
92
|
abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
|
|
93
|
+
input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
|
|
94
|
+
input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
|
|
92
95
|
"""
|
|
93
96
|
|
|
94
97
|
abstask_prompt = "Retrieve semantically similar text."
|
|
95
98
|
column_names: tuple[str, str] = ("sentence1", "sentence2")
|
|
96
99
|
min_score: int = 0
|
|
97
100
|
max_score: int = 5
|
|
101
|
+
input1_prompt_type: PromptType | None = None
|
|
102
|
+
input2_prompt_type: PromptType | None = None
|
|
98
103
|
|
|
99
104
|
def _evaluate_subset(
|
|
100
105
|
self,
|
|
101
|
-
model:
|
|
106
|
+
model: MTEBModels,
|
|
102
107
|
data_split: Dataset,
|
|
103
|
-
encode_kwargs:
|
|
108
|
+
encode_kwargs: EncodeKwargs,
|
|
104
109
|
hf_split: str,
|
|
105
110
|
hf_subset: str,
|
|
106
111
|
prediction_folder: Path | None = None,
|
|
107
112
|
**kwargs: Any,
|
|
108
113
|
) -> STSMetrics:
|
|
114
|
+
if not isinstance(model, EncoderProtocol):
|
|
115
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
116
|
+
|
|
109
117
|
normalized_scores = list(map(self._normalize, data_split["score"]))
|
|
110
118
|
data_split = data_split.select_columns(list(self.column_names))
|
|
111
119
|
|
|
@@ -115,6 +123,8 @@ class AbsTaskSTS(AbsTask):
|
|
|
115
123
|
task_metadata=self.metadata,
|
|
116
124
|
hf_split=hf_split,
|
|
117
125
|
hf_subset=hf_subset,
|
|
126
|
+
input1_prompt_type=self.input1_prompt_type,
|
|
127
|
+
input2_prompt_type=self.input2_prompt_type,
|
|
118
128
|
**kwargs,
|
|
119
129
|
)
|
|
120
130
|
scores = evaluator(model, encode_kwargs=encode_kwargs)
|
|
@@ -135,7 +145,7 @@ class AbsTaskSTS(AbsTask):
|
|
|
135
145
|
) -> STSMetrics:
|
|
136
146
|
def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
|
|
137
147
|
"""Return (pearson, spearman) correlations between x and y."""
|
|
138
|
-
return pearsonr(x, y)[0], spearmanr(x, y)[0]
|
|
148
|
+
return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])
|
|
139
149
|
|
|
140
150
|
cosine_pearson, cosine_spearman = compute_corr(
|
|
141
151
|
normalized_scores, scores["cosine_scores"]
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -2,9 +2,10 @@ import json
|
|
|
2
2
|
import logging
|
|
3
3
|
from collections.abc import Sequence
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Literal
|
|
5
|
+
from typing import Any, Literal, cast
|
|
6
6
|
|
|
7
7
|
from huggingface_hub import (
|
|
8
|
+
CardData,
|
|
8
9
|
DatasetCard,
|
|
9
10
|
DatasetCardData,
|
|
10
11
|
constants,
|
|
@@ -107,6 +108,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
|
|
|
107
108
|
SampleCreationMethod = Literal[
|
|
108
109
|
"found",
|
|
109
110
|
"created",
|
|
111
|
+
"created and machine-translated",
|
|
110
112
|
"human-translated and localized",
|
|
111
113
|
"human-translated",
|
|
112
114
|
"machine-translated",
|
|
@@ -149,7 +151,7 @@ _TASK_TYPE = (
|
|
|
149
151
|
"InstructionReranking",
|
|
150
152
|
) + MIEB_TASK_TYPE
|
|
151
153
|
|
|
152
|
-
TaskType = Literal[_TASK_TYPE]
|
|
154
|
+
TaskType = Literal[_TASK_TYPE] # type: ignore[valid-type]
|
|
153
155
|
"""The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""
|
|
154
156
|
|
|
155
157
|
|
|
@@ -191,8 +193,10 @@ AnnotatorType = Literal[
|
|
|
191
193
|
"""The type of the annotators. Is often important for understanding the quality of a dataset."""
|
|
192
194
|
|
|
193
195
|
|
|
194
|
-
PromptDict = TypedDict(
|
|
195
|
-
"PromptDict",
|
|
196
|
+
PromptDict = TypedDict( # type: ignore[misc]
|
|
197
|
+
"PromptDict",
|
|
198
|
+
{prompt_type.value: str for prompt_type in PromptType},
|
|
199
|
+
total=False,
|
|
196
200
|
)
|
|
197
201
|
"""A dictionary containing the prompt used for the task.
|
|
198
202
|
|
|
@@ -364,7 +368,7 @@ class TaskMetadata(BaseModel):
|
|
|
364
368
|
"""Return a dictionary mapping huggingface subsets to languages."""
|
|
365
369
|
if isinstance(self.eval_langs, dict):
|
|
366
370
|
return self.eval_langs
|
|
367
|
-
return {"default": self.eval_langs}
|
|
371
|
+
return {"default": cast(list[str], self.eval_langs)}
|
|
368
372
|
|
|
369
373
|
@property
|
|
370
374
|
def intext_citation(self, include_cite: bool = True) -> str:
|
|
@@ -375,9 +379,8 @@ class TaskMetadata(BaseModel):
|
|
|
375
379
|
if include_cite and cite:
|
|
376
380
|
# check for whitespace in the citation
|
|
377
381
|
if " " in cite:
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
)
|
|
382
|
+
msg = "Citation contains whitespace. Please ensure that the citation is correctly formatted."
|
|
383
|
+
logger.warning(msg)
|
|
381
384
|
return f"\\cite{{{cite}}}"
|
|
382
385
|
return cite
|
|
383
386
|
|
|
@@ -413,7 +416,7 @@ class TaskMetadata(BaseModel):
|
|
|
413
416
|
for subset, subset_value in stats.items():
|
|
414
417
|
if subset == "hf_subset_descriptive_stats":
|
|
415
418
|
continue
|
|
416
|
-
n_samples[subset] = subset_value["num_samples"]
|
|
419
|
+
n_samples[subset] = subset_value["num_samples"]
|
|
417
420
|
return n_samples
|
|
418
421
|
|
|
419
422
|
@property
|
|
@@ -446,7 +449,7 @@ class TaskMetadata(BaseModel):
|
|
|
446
449
|
Raises:
|
|
447
450
|
ValueError: If the prompt type is not recognized.
|
|
448
451
|
"""
|
|
449
|
-
if prompt_type is None:
|
|
452
|
+
if prompt_type is None or self.category is None:
|
|
450
453
|
return self.modalities
|
|
451
454
|
query_modalities, doc_modalities = self.category.split("2")
|
|
452
455
|
category_to_modality: dict[str, Modalities] = {
|
|
@@ -466,7 +469,7 @@ class TaskMetadata(BaseModel):
|
|
|
466
469
|
|
|
467
470
|
def _create_dataset_card_data(
|
|
468
471
|
self,
|
|
469
|
-
existing_dataset_card_data:
|
|
472
|
+
existing_dataset_card_data: CardData | None = None,
|
|
470
473
|
) -> tuple[DatasetCardData, dict[str, Any]]:
|
|
471
474
|
"""Create a DatasetCardData object from the task metadata.
|
|
472
475
|
|
|
@@ -482,7 +485,6 @@ class TaskMetadata(BaseModel):
|
|
|
482
485
|
dataset_type = [
|
|
483
486
|
*self._hf_task_type(),
|
|
484
487
|
*self._hf_task_category(),
|
|
485
|
-
*self._hf_subtypes(),
|
|
486
488
|
]
|
|
487
489
|
languages = self._hf_languages()
|
|
488
490
|
|
|
@@ -501,12 +503,13 @@ class TaskMetadata(BaseModel):
|
|
|
501
503
|
|
|
502
504
|
tags = ["mteb"] + self.modalities
|
|
503
505
|
|
|
504
|
-
descriptive_stats =
|
|
505
|
-
if descriptive_stats is not None:
|
|
506
|
-
|
|
506
|
+
descriptive_stats = ""
|
|
507
|
+
if self.descriptive_stats is not None:
|
|
508
|
+
descriptive_stats_ = self.descriptive_stats
|
|
509
|
+
for split, split_stat in descriptive_stats_.items():
|
|
507
510
|
if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
|
|
508
511
|
split_stat.pop("hf_subset_descriptive_stats", {})
|
|
509
|
-
descriptive_stats = json.dumps(
|
|
512
|
+
descriptive_stats = json.dumps(descriptive_stats_, indent=4)
|
|
510
513
|
|
|
511
514
|
dataset_card_data_params = existing_dataset_card_data.to_dict()
|
|
512
515
|
# override the existing values
|
|
@@ -583,10 +586,8 @@ class TaskMetadata(BaseModel):
|
|
|
583
586
|
|
|
584
587
|
def _hf_subtypes(self) -> list[str]:
|
|
585
588
|
# to get full list of available task_ids execute
|
|
586
|
-
#
|
|
587
|
-
#
|
|
588
|
-
# "repoType": "dataset"
|
|
589
|
-
# })
|
|
589
|
+
# https://huggingface.co/api/datasets-tags-by-type?type=task_ids
|
|
590
|
+
# ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
|
|
590
591
|
mteb_to_hf_subtype = {
|
|
591
592
|
"Article retrieval": ["document-retrieval"],
|
|
592
593
|
"Conversational retrieval": ["conversational", "utterance-retrieval"],
|
|
@@ -608,7 +609,7 @@ class TaskMetadata(BaseModel):
|
|
|
608
609
|
"hate-speech-detection",
|
|
609
610
|
],
|
|
610
611
|
"Thematic clustering": [],
|
|
611
|
-
"Scientific Reranking": [],
|
|
612
|
+
"Scientific Reranking": ["text-scoring"],
|
|
612
613
|
"Claim verification": ["fact-checking", "fact-checking-retrieval"],
|
|
613
614
|
"Topic classification": ["topic-classification"],
|
|
614
615
|
"Code retrieval": [],
|
|
@@ -616,21 +617,21 @@ class TaskMetadata(BaseModel):
|
|
|
616
617
|
"Cross-Lingual Semantic Discrimination": [],
|
|
617
618
|
"Textual Entailment": ["natural-language-inference"],
|
|
618
619
|
"Counterfactual Detection": [],
|
|
619
|
-
"Emotion classification": [],
|
|
620
|
+
"Emotion classification": ["sentiment-classification"],
|
|
620
621
|
"Reasoning as Retrieval": [],
|
|
621
622
|
"Rendered Texts Understanding": [],
|
|
622
623
|
"Image Text Retrieval": [],
|
|
623
624
|
"Object recognition": [],
|
|
624
625
|
"Scene recognition": [],
|
|
625
626
|
"Caption Pairing": ["image-captioning"],
|
|
626
|
-
"Emotion recognition": [],
|
|
627
|
+
"Emotion recognition": ["sentiment-scoring"],
|
|
627
628
|
"Textures recognition": [],
|
|
628
629
|
"Activity recognition": [],
|
|
629
630
|
"Tumor detection": [],
|
|
630
631
|
"Duplicate Detection": [],
|
|
631
632
|
"Rendered semantic textual similarity": [
|
|
632
633
|
"semantic-similarity-scoring",
|
|
633
|
-
"
|
|
634
|
+
"semantic-similarity-classification",
|
|
634
635
|
],
|
|
635
636
|
"Intent classification": [
|
|
636
637
|
"intent-classification",
|
|
@@ -644,10 +645,8 @@ class TaskMetadata(BaseModel):
|
|
|
644
645
|
|
|
645
646
|
def _hf_task_type(self) -> list[str]:
|
|
646
647
|
# to get full list of task_types execute:
|
|
647
|
-
#
|
|
648
|
-
#
|
|
649
|
-
# }).json()
|
|
650
|
-
# or look at https://huggingface.co/tasks
|
|
648
|
+
# https://huggingface.co/api/datasets-tags-by-type?type=task_categories
|
|
649
|
+
# ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
|
|
651
650
|
mteb_task_type_to_datasets = {
|
|
652
651
|
# Text
|
|
653
652
|
"BitextMining": ["translation"],
|
|
@@ -666,7 +665,7 @@ class TaskMetadata(BaseModel):
|
|
|
666
665
|
"Any2AnyRetrieval": ["visual-document-retrieval"],
|
|
667
666
|
"Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
|
|
668
667
|
"VisionCentricQA": ["visual-question-answering"],
|
|
669
|
-
"ImageClustering": ["image-
|
|
668
|
+
"ImageClustering": ["image-feature-extraction"],
|
|
670
669
|
"ImageClassification": ["image-classification"],
|
|
671
670
|
"ImageMultilabelClassification": ["image-classification"],
|
|
672
671
|
"DocumentUnderstanding": ["visual-document-retrieval"],
|
|
@@ -694,11 +693,11 @@ class TaskMetadata(BaseModel):
|
|
|
694
693
|
|
|
695
694
|
def _hf_languages(self) -> list[str]:
|
|
696
695
|
languages: list[str] = []
|
|
697
|
-
if self.is_multilingual:
|
|
698
|
-
for val in
|
|
696
|
+
if self.is_multilingual and isinstance(self.eval_langs, dict):
|
|
697
|
+
for val in self.eval_langs.values():
|
|
699
698
|
languages.extend(val)
|
|
700
699
|
else:
|
|
701
|
-
languages = self.eval_langs
|
|
700
|
+
languages = cast(list[str], self.eval_langs)
|
|
702
701
|
# value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
|
|
703
702
|
# or a special value like "code", "multilingual".
|
|
704
703
|
readme_langs = []
|
|
@@ -710,7 +709,7 @@ class TaskMetadata(BaseModel):
|
|
|
710
709
|
readme_langs.append(lang_name)
|
|
711
710
|
return sorted(set(readme_langs))
|
|
712
711
|
|
|
713
|
-
def _hf_license(self) -> str:
|
|
712
|
+
def _hf_license(self) -> str | None:
|
|
714
713
|
dataset_license = self.license
|
|
715
714
|
if dataset_license:
|
|
716
715
|
license_mapping = {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, ClassVar, TypedDict
|
|
4
|
+
from typing import Any, ClassVar, TypedDict, cast
|
|
5
5
|
|
|
6
6
|
from datasets import Dataset, DatasetDict
|
|
7
7
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
|
@@ -10,7 +10,7 @@ from mteb._evaluators import BitextMiningEvaluator
|
|
|
10
10
|
from mteb.abstasks._statistics_calculation import calculate_text_statistics
|
|
11
11
|
from mteb.abstasks.abstask import AbsTask
|
|
12
12
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
13
|
-
from mteb.types import HFSubset, ScoresDict
|
|
13
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
14
14
|
from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
@@ -73,11 +73,14 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
73
73
|
split: str = "test",
|
|
74
74
|
subsets_to_run: list[HFSubset] | None = None,
|
|
75
75
|
*,
|
|
76
|
-
encode_kwargs:
|
|
76
|
+
encode_kwargs: EncodeKwargs,
|
|
77
77
|
prediction_folder: Path | None = None,
|
|
78
78
|
**kwargs: Any,
|
|
79
79
|
) -> dict[HFSubset, ScoresDict]:
|
|
80
80
|
"""Added load for "parallel" datasets"""
|
|
81
|
+
if not isinstance(model, EncoderProtocol):
|
|
82
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
83
|
+
|
|
81
84
|
if not self.data_loaded:
|
|
82
85
|
self.load_data()
|
|
83
86
|
|
|
@@ -87,11 +90,16 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
87
90
|
if subsets_to_run is not None:
|
|
88
91
|
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
|
|
89
92
|
|
|
90
|
-
|
|
93
|
+
encoder_model = cast(EncoderProtocol, model)
|
|
94
|
+
|
|
95
|
+
if self.dataset is None:
|
|
96
|
+
raise ValueError("Dataset is not loaded.")
|
|
97
|
+
|
|
98
|
+
scores: dict[str, BitextMiningMetrics] = {}
|
|
91
99
|
if self.parallel_subsets:
|
|
92
|
-
scores = self._evaluate_subset(
|
|
93
|
-
|
|
94
|
-
self.dataset[split],
|
|
100
|
+
scores = self._evaluate_subset( # type: ignore[assignment]
|
|
101
|
+
encoder_model,
|
|
102
|
+
self.dataset[split],
|
|
95
103
|
parallel=True,
|
|
96
104
|
hf_split=split,
|
|
97
105
|
hf_subset="parallel",
|
|
@@ -109,8 +117,8 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
109
117
|
data_split = self.dataset[split]
|
|
110
118
|
else:
|
|
111
119
|
data_split = self.dataset[hf_subset][split]
|
|
112
|
-
scores[hf_subset] = self._evaluate_subset(
|
|
113
|
-
|
|
120
|
+
scores[hf_subset] = self._evaluate_subset( # type: ignore[assignment]
|
|
121
|
+
encoder_model,
|
|
114
122
|
data_split,
|
|
115
123
|
hf_split=split,
|
|
116
124
|
hf_subset=hf_subset,
|
|
@@ -119,32 +127,32 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
119
127
|
**kwargs,
|
|
120
128
|
)
|
|
121
129
|
|
|
122
|
-
return scores
|
|
130
|
+
return cast(dict[HFSubset, ScoresDict], scores)
|
|
123
131
|
|
|
124
132
|
def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
|
|
125
133
|
pairs = self._DEFAULT_PAIR
|
|
126
134
|
if parallel:
|
|
127
|
-
pairs = [langpair.split("-") for langpair in self.hf_subsets]
|
|
135
|
+
pairs = [langpair.split("-") for langpair in self.hf_subsets] # type: ignore[misc]
|
|
128
136
|
return pairs
|
|
129
137
|
|
|
130
|
-
def _evaluate_subset(
|
|
138
|
+
def _evaluate_subset( # type: ignore[override]
|
|
131
139
|
self,
|
|
132
140
|
model: EncoderProtocol,
|
|
133
141
|
data_split: Dataset,
|
|
134
142
|
*,
|
|
135
143
|
hf_split: str,
|
|
136
144
|
hf_subset: str,
|
|
137
|
-
|
|
138
|
-
encode_kwargs: dict[str, Any],
|
|
145
|
+
encode_kwargs: EncodeKwargs,
|
|
139
146
|
prediction_folder: Path | None = None,
|
|
147
|
+
parallel: bool = False,
|
|
140
148
|
**kwargs,
|
|
141
|
-
) ->
|
|
149
|
+
) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
|
|
142
150
|
pairs = self._get_pairs(parallel)
|
|
143
151
|
|
|
144
152
|
evaluator = BitextMiningEvaluator(
|
|
145
153
|
data_split,
|
|
146
154
|
task_metadata=self.metadata,
|
|
147
|
-
pair_columns=pairs,
|
|
155
|
+
pair_columns=pairs,
|
|
148
156
|
hf_split=hf_split,
|
|
149
157
|
hf_subset=hf_subset,
|
|
150
158
|
**kwargs,
|
|
@@ -168,16 +176,16 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
168
176
|
)
|
|
169
177
|
|
|
170
178
|
if parallel:
|
|
171
|
-
|
|
179
|
+
parallel_metrics = {}
|
|
172
180
|
for keys, nearest_neighbors in neighbours.items():
|
|
173
|
-
|
|
181
|
+
parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
|
|
174
182
|
|
|
175
|
-
for v in
|
|
183
|
+
for v in parallel_metrics.values():
|
|
176
184
|
self._add_main_score(v)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
185
|
+
return parallel_metrics
|
|
186
|
+
def_pair_str = "-".join(self._DEFAULT_PAIR[0])
|
|
187
|
+
metrics = self._compute_metrics(neighbours[def_pair_str], gold)
|
|
188
|
+
self._add_main_score(metrics)
|
|
181
189
|
return metrics
|
|
182
190
|
|
|
183
191
|
def _compute_metrics(
|
|
@@ -250,8 +258,11 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
250
258
|
)
|
|
251
259
|
|
|
252
260
|
def _push_dataset_to_hub(self, repo_name: str) -> None:
|
|
261
|
+
if self.dataset is None:
|
|
262
|
+
raise ValueError("Dataset is not loaded.")
|
|
263
|
+
|
|
253
264
|
if self.metadata.is_multilingual:
|
|
254
|
-
dataset = defaultdict(dict)
|
|
265
|
+
dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
|
|
255
266
|
for config in self.metadata.eval_langs:
|
|
256
267
|
logger.info(f"Converting {config} of {self.metadata.name}")
|
|
257
268
|
|
|
@@ -266,10 +277,10 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
266
277
|
for split in self.dataset[config]:
|
|
267
278
|
dataset[split][lang_1] = self.dataset[config][split][sent_1]
|
|
268
279
|
dataset[split][lang_2] = self.dataset[config][split][sent_2]
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
280
|
+
dataset_dict = DatasetDict(
|
|
281
|
+
{split: Dataset.from_dict(dataset[split]) for split in dataset}
|
|
282
|
+
)
|
|
283
|
+
dataset_dict.push_to_hub(repo_name)
|
|
273
284
|
else:
|
|
274
285
|
sentences = {}
|
|
275
286
|
for split in self.dataset:
|
mteb/abstasks/text/reranking.py
CHANGED
|
@@ -16,7 +16,7 @@ else:
|
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
|
-
OLD_FORMAT_RERANKING_TASKS = []
|
|
19
|
+
OLD_FORMAT_RERANKING_TASKS: list[str] = []
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
@deprecated(
|
|
@@ -100,12 +100,14 @@ class AbsTaskReranking(AbsTaskRetrieval):
|
|
|
100
100
|
if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS:
|
|
101
101
|
return
|
|
102
102
|
|
|
103
|
-
|
|
103
|
+
logger.info(
|
|
104
104
|
f"Transforming old format to standard format for {self.metadata.name}"
|
|
105
105
|
)
|
|
106
106
|
|
|
107
107
|
given_dataset = copy(given_dataset)
|
|
108
|
-
self.dataset = defaultdict(
|
|
108
|
+
self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
|
|
109
|
+
lambda: defaultdict(dict) # type: ignore[arg-type]
|
|
110
|
+
)
|
|
109
111
|
|
|
110
112
|
hf_subsets = self.hf_subsets
|
|
111
113
|
|
|
@@ -115,19 +117,19 @@ class AbsTaskReranking(AbsTaskRetrieval):
|
|
|
115
117
|
if hf_subset in cur_dataset:
|
|
116
118
|
cur_dataset = cur_dataset[hf_subset]
|
|
117
119
|
elif "name" in self.metadata.dataset:
|
|
118
|
-
cur_dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
120
|
+
cur_dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
119
121
|
assert hf_subset == "default", (
|
|
120
122
|
f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
|
|
121
123
|
)
|
|
122
124
|
else:
|
|
123
125
|
cur_dataset = datasets.load_dataset(
|
|
124
126
|
**self.metadata.dataset, name=hf_subset
|
|
125
|
-
)
|
|
127
|
+
)
|
|
126
128
|
|
|
127
129
|
for split in cur_dataset:
|
|
128
130
|
corpus = []
|
|
129
131
|
queries = []
|
|
130
|
-
relevant_docs = defaultdict(dict)
|
|
132
|
+
relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
|
|
131
133
|
top_ranked = defaultdict(list)
|
|
132
134
|
|
|
133
135
|
# Create an enumerated dataset to pass indices
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
import numpy as np
|
|
6
5
|
from datasets import Dataset
|
|
@@ -12,7 +11,8 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
12
11
|
calculate_text_statistics,
|
|
13
12
|
)
|
|
14
13
|
from mteb.abstasks.abstask import AbsTask
|
|
15
|
-
from mteb.models import EncoderProtocol
|
|
14
|
+
from mteb.models import EncoderProtocol, MTEBModels
|
|
15
|
+
from mteb.types import EncodeKwargs
|
|
16
16
|
from mteb.types.statistics import (
|
|
17
17
|
ScoreStatistics,
|
|
18
18
|
SplitDescriptiveStatistics,
|
|
@@ -77,17 +77,22 @@ class AbsTaskSummarization(AbsTask):
|
|
|
77
77
|
|
|
78
78
|
def _evaluate_subset(
|
|
79
79
|
self,
|
|
80
|
-
model:
|
|
80
|
+
model: MTEBModels,
|
|
81
81
|
data_split: Dataset,
|
|
82
82
|
*,
|
|
83
83
|
hf_split: str,
|
|
84
84
|
hf_subset: str,
|
|
85
|
-
encode_kwargs:
|
|
85
|
+
encode_kwargs: EncodeKwargs,
|
|
86
86
|
prediction_folder: Path | None = None,
|
|
87
87
|
**kwargs,
|
|
88
88
|
) -> SummarizationMetrics:
|
|
89
|
+
if not isinstance(model, EncoderProtocol):
|
|
90
|
+
raise TypeError("Expected model to be an instance of EncoderProtocol")
|
|
91
|
+
|
|
89
92
|
normalized_scores = [
|
|
90
|
-
(
|
|
93
|
+
(
|
|
94
|
+
(np.array(x) - self.min_score) / (self.max_score - self.min_score)
|
|
95
|
+
).tolist()
|
|
91
96
|
for x in data_split[self.relevancy_column_name]
|
|
92
97
|
]
|
|
93
98
|
evaluator = self.evaluator(
|