mteb 2.0.5 → 2.1.19 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +8 -3
- mteb/_evaluators/any_sts_evaluator.py +14 -12
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +0 -9
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +2 -1
- mteb/benchmarks/_create_table.py +1 -3
- mteb/benchmarks/benchmark.py +18 -1
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +125 -16
- mteb/benchmarks/get_benchmark.py +3 -1
- mteb/cache.py +7 -3
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/evaluate.py +26 -6
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/app.py +5 -3
- mteb/leaderboard/benchmark_selector.py +4 -2
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/instruct_wrapper.py +3 -0
- mteb/models/model_implementations/align_models.py +6 -0
- mteb/models/model_implementations/andersborges.py +51 -0
- mteb/models/model_implementations/ara_models.py +7 -0
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/blip2_models.py +9 -0
- mteb/models/model_implementations/blip_models.py +19 -0
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/cadet_models.py +8 -0
- mteb/models/model_implementations/cde_models.py +12 -0
- mteb/models/model_implementations/codefuse_models.py +15 -0
- mteb/models/model_implementations/codesage_models.py +12 -0
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/colqwen_models.py +57 -0
- mteb/models/model_implementations/emillykkejensen_models.py +70 -0
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jasper_models.py +253 -2
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/kalm_models.py +159 -25
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +8 -2
- mteb/models/model_implementations/moco_models.py +9 -0
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/openclip_models.py +16 -0
- mteb/models/model_implementations/piccolo_models.py +6 -0
- mteb/models/model_implementations/rasgaard_models.py +33 -0
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
- mteb/models/model_implementations/tarka_models.py +374 -0
- mteb/models/model_implementations/voyage_models.py +6 -7
- mteb/models/model_implementations/voyage_v.py +10 -9
- mteb/models/model_implementations/yuan_models.py +33 -0
- mteb/models/search_wrappers.py +6 -5
- mteb/results/task_result.py +19 -17
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +2 -3
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
- mteb/tasks/classification/nld/iconclass_classification.py +44 -0
- mteb/tasks/classification/nld/open_tender_classification.py +41 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +18 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb-2.1.19.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
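The headline additions in 2.1.19 are a suite of Dutch (nld) tasks spanning classification, clustering, multilabel and pair classification, retrieval, and STS, the Vidore3 document-understanding retrieval family, and `.v2` variants of several hard-negative retrieval tasks. As a quick orientation, here is a minimal sketch of looking the new tasks up through mteb's public API (task names are taken from the file list above; `get_tasks` and the `metadata` attribute are assumed from the mteb 2.x interface):

```python
import mteb

# Task names come from the new files listed above; get_tasks is mteb's
# public lookup helper (assumed per the mteb 2.x API).
tasks = mteb.get_tasks(tasks=["DutchColaClassification", "DBPediaHardNegatives.v2"])
for task in tasks:
    print(task.metadata.name, task.metadata.type, task.metadata.main_score)
```

The largest per-file source diffs are reproduced below.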
mteb/tasks/retrieval/eng/dbpedia_retrieval.py (+55 -50):

```diff
@@ -1,30 +1,21 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class DBPedia(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="DBPedia",
-        description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base",
-        reference="https://github.com/iai-group/DBpedia-Entity/",
-        dataset={
-            "path": "mteb/dbpedia",
-            "revision": "c0f706b76e590d620bd6618b3ca8efdd34e2d659",
-        },
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=("2017-01-01", "2017-01-01"),  # best guess: based on publication date
-        domains=["Written", "Encyclopaedic"],
-        task_subtypes=[],
-        license="mit",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
+_dbpedia_metadata = dict(
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["eng-Latn"],
+    main_score="ndcg_at_10",
+    date=("2017-01-01", "2017-01-01"),  # best guess: based on publication date
+    domains=["Written", "Encyclopaedic"],
+    task_subtypes=[],
+    license="mit",
+    annotations_creators="derived",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
 @inproceedings{Hasibi:2017:DVT,
   author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie},
   booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
@@ -36,45 +27,59 @@ class DBPedia(AbsTaskRetrieval):
   year = {2017},
 }
 """,
+)
+
+
+class DBPedia(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="DBPedia",
+        description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base",
+        reference="https://github.com/iai-group/DBpedia-Entity/",
+        dataset={
+            "path": "mteb/dbpedia",
+            "revision": "c0f706b76e590d620bd6618b3ca8efdd34e2d659",
+        },
         prompt={
             "query": "Given a query, retrieve relevant entity descriptions from DBPedia"
         },
+        **_dbpedia_metadata,
     )
 
 
 class DBPediaHardNegatives(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="DBPediaHardNegatives",
-        description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
+        description=(
+            "DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+        ),
+        reference="https://github.com/iai-group/DBpedia-Entity/",
+        dataset={
+            "path": "mteb/DBPedia_test_top_250_only_w_correct-v2",
+            "revision": "943ec7fdfef3728b2ad1966c5b6479ff9ffd26c9",
+        },
+        superseded_by="DBPediaHardNegatives.v2",
+        adapted_from=["DBPedia"],
+        **_dbpedia_metadata,
+    )
+
+
+class DBPediaHardNegativesV2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="DBPediaHardNegatives.v2",
+        description=(
+            "DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct. "
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
+        ),
         reference="https://github.com/iai-group/DBpedia-Entity/",
         dataset={
             "path": "mteb/DBPedia_test_top_250_only_w_correct-v2",
             "revision": "943ec7fdfef3728b2ad1966c5b6479ff9ffd26c9",
         },
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=("2017-01-01", "2017-01-01"),  # best guess: based on publication date
-        domains=["Written", "Encyclopaedic"],
-        task_subtypes=[],
-        license="mit",
-        annotations_creators="derived",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{Hasibi:2017:DVT,
-  author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie},
-  booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
-  doi = {10.1145/3077136.3080751},
-  pages = {1265--1268},
-  publisher = {ACM},
-  series = {SIGIR '17},
-  title = {DBpedia-Entity V2: A Test Collection for Entity Search},
-  year = {2017},
-}
-""",
         adapted_from=["DBPedia"],
+        prompt={
+            "query": "Given a query, retrieve relevant entity descriptions from DBPedia"
+        },
+        **_dbpedia_metadata,
     )
```
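The same refactor repeats across the retrieval modules in this release: metadata fields shared by a task and its hard-negative variants are hoisted into a module-level dict and unpacked into each `TaskMetadata(...)` call, so fields like the license or the BibTeX citation are defined once instead of three times. A self-contained sketch of the pattern (a plain function stands in for `TaskMetadata`):

```python
# Plain-dict stand-in for TaskMetadata, isolating the **-unpacking pattern.
def task_metadata(**fields) -> dict:
    return fields

_shared_metadata = dict(
    type="Retrieval",
    main_score="ndcg_at_10",
    license="mit",
)

dbpedia = task_metadata(name="DBPedia", **_shared_metadata)
hard_negatives_v2 = task_metadata(name="DBPediaHardNegatives.v2", **_shared_metadata)
assert dbpedia["license"] == hard_negatives_v2["license"] == "mit"
```

A keyword passed explicitly must not also appear in the unpacked dict (Python raises a `TypeError` on duplicates), which is why the per-task fields such as `name`, `description`, `dataset`, and `prompt` stay inline.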
mteb/tasks/retrieval/eng/fever_retrieval.py (+62 -67):

```diff
@@ -1,36 +1,22 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class FEVER(AbsTaskRetrieval):
-    ignore_identical_ids = True
-
-    metadata = TaskMetadata(
-        name="FEVER",
-        dataset={
-            "path": "mteb/fever",
-            "revision": "bea83ef9e8fb933d90a2f1d5515737465d613e12",
-        },
-        description=(
-            "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences"
-            " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were"
-            " derived from."
-        ),
-        reference="https://fever.ai/",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=None,
-        domains=["Encyclopaedic", "Written"],
-        task_subtypes=["Claim verification"],
-        license="cc-by-nc-sa-3.0",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
+_fever_metadata = dict(
+    reference="https://fever.ai/",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["eng-Latn"],
+    main_score="ndcg_at_10",
+    date=None,
+    domains=["Encyclopaedic", "Written"],
+    task_subtypes=["Claim verification"],
+    license="cc-by-nc-sa-3.0",
+    annotations_creators="human-annotated",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
 @inproceedings{thorne-etal-2018-fever,
   address = {New Orleans, Louisiana},
   author = {Thorne, James and
@@ -50,9 +36,27 @@ Stent, Amanda},
   year = {2018},
 }
 """,
+)
+
+
+class FEVER(AbsTaskRetrieval):
+    ignore_identical_ids = True
+
+    metadata = TaskMetadata(
+        name="FEVER",
+        dataset={
+            "path": "mteb/fever",
+            "revision": "bea83ef9e8fb933d90a2f1d5515737465d613e12",
+        },
+        description=(
+            "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences "
+            "extracted from Wikipedia and subsequently verified without knowledge of the sentence they were "
+            "derived from."
+        ),
         prompt={
             "query": "Given a claim, retrieve documents that support or refute the claim"
         },
+        **_fever_metadata,
     )
 
 
@@ -66,43 +70,34 @@ class FEVERHardNegatives(AbsTaskRetrieval):
             "revision": "080c9ed6267b65029207906e815d44a9240bafca",
         },
         description=(
-            "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences"
-            " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were"
-            " derived from. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences "
+            "extracted from Wikipedia and subsequently verified without knowledge of the sentence they were "
+            "derived from. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+        ),
+        adapted_from=["FEVER"],
+        superseded_by="FEVERHardNegatives.v2",
+        **_fever_metadata,
+    )
+
+
+class FEVERHardNegativesV2(AbsTaskRetrieval):
+    ignore_identical_ids = True
+
+    metadata = TaskMetadata(
+        name="FEVERHardNegatives.v2",
+        dataset={
+            "path": "mteb/FEVER_test_top_250_only_w_correct-v2",
+            "revision": "080c9ed6267b65029207906e815d44a9240bafca",
+        },
+        description=(
+            "FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences "
+            "extracted from Wikipedia and subsequently verified without knowledge of the sentence they were "
+            "derived from. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct. "
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
         ),
-        reference="https://fever.ai/",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=None,
-        domains=["Encyclopaedic", "Written"],
-        task_subtypes=["Claim verification"],
-        license="cc-by-nc-sa-3.0",
-        annotations_creators="human-annotated",
-        dialect=None,
-        sample_creation=None,
-        bibtex_citation=r"""
-@inproceedings{thorne-etal-2018-fever,
-  address = {New Orleans, Louisiana},
-  author = {Thorne, James and
-Vlachos, Andreas and
-Christodoulopoulos, Christos and
-Mittal, Arpit},
-  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
-  doi = {10.18653/v1/N18-1074},
-  editor = {Walker, Marilyn and
-Ji, Heng and
-Stent, Amanda},
-  month = jun,
-  pages = {809--819},
-  publisher = {Association for Computational Linguistics},
-  title = {{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification},
-  url = {https://aclanthology.org/N18-1074},
-  year = {2018},
-}
-""",
         adapted_from=["FEVER"],
+        prompt={
+            "query": "Given a claim, retrieve documents that support or refute the claim"
+        },
+        **_fever_metadata,
     )
```
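The hard-negative descriptions in this file and its siblings all point at the same construction: for each query, the top 250 documents from BM25, e5-multilingual-large, and e5-mistral-instruct are pooled into a reduced corpus that still contains the correct documents. A sketch of that pooling step for a single query (toy inputs; this is not the actual dataset-building script):

```python
def pool_top_k(rankings_per_retriever: dict[str, list[str]], k: int = 250) -> set[str]:
    """Union of each retriever's top-k ranked document ids for one query."""
    pooled: set[str] = set()
    for ranked_doc_ids in rankings_per_retriever.values():
        pooled.update(ranked_doc_ids[:k])
    return pooled

# Toy rankings from the three systems named in the task descriptions.
pooled = pool_top_k(
    {
        "bm25": ["d1", "d2", "d3"],
        "e5-multilingual-large": ["d2", "d4"],
        "e5-mistral-instruct": ["d1", "d5"],
    },
    k=2,
)
assert pooled == {"d1", "d2", "d4", "d5"}
```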
mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py (+0 -4):

```diff
@@ -24,9 +24,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
     shared_corpus = shared_corpus.map(
         lambda x: {
             "id": "corpus-" + str(x["id"]),
-            # "text": x["text"],
             "modality": "text",
-            "image": None,
         },
         remove_columns=[
             "split",
@@ -40,9 +38,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x: {
                 "id": "query-" + str(x["id"]),
-                "text": None,
                 "modality": "image",
-                # "image": x["image"],
             },
             remove_columns=[
                 "split",
```
mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py (+0 -4):

```diff
@@ -24,9 +24,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
     shared_corpus = shared_corpus.map(
         lambda x: {
             "id": "corpus-" + str(x["id"]),
-            "text": None,
             "modality": "image",
-            # "image": None,
         },
         remove_columns=[
             "split",
@@ -40,9 +38,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x: {
                 "id": "query-" + str(x["id"]),
-                # "text": None,
                 "modality": "text",
-                "image": None,
             },
             remove_columns=[
                 "split",
```
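Both HatefulMemes loaders (and the memotion loaders further down) drop the `"text": None` / `"image": None` placeholder columns and keep only the field each side actually carries, leaving `datasets.Dataset.map` to add the derived columns and `remove_columns` to discard unused ones. A toy reproduction of the call shape (hypothetical columns, not the HatefulMemes schema):

```python
from datasets import Dataset

ds = Dataset.from_dict({"id": [0, 1], "split": ["test", "test"], "caption": ["a", "b"]})
corpus = ds.map(
    lambda x: {
        "id": "corpus-" + str(x["id"]),  # prefix ids so corpus and query ids cannot collide
        "modality": "text",
    },
    remove_columns=["split"],
)
print(corpus.column_names)  # ['id', 'caption', 'modality']
```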
mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py (+57 -67):

```diff
@@ -1,33 +1,22 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
-
-class HotpotQA(AbsTaskRetrieval):
-    metadata = TaskMetadata(
-        name="HotpotQA",
-        dataset={
-            "path": "mteb/hotpotqa",
-            "revision": "ab518f4d6fcca38d87c25209f94beba119d02014",
-        },
-        description=(
-            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong"
-            " supervision for supporting facts to enable more explainable question answering systems."
-        ),
-        reference="https://hotpotqa.github.io/",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=("2018-01-01", "2018-12-31"),  # best guess: based on publication date
-        domains=["Web", "Written"],
-        task_subtypes=["Question answering"],
-        license="cc-by-sa-4.0",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
+_hotpot_qa_metadata = dict(
+    reference="https://hotpotqa.github.io/",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["eng-Latn"],
+    main_score="ndcg_at_10",
+    date=("2018-01-01", "2018-12-31"),  # best guess: based on publication date
+    domains=["Web", "Written"],
+    task_subtypes=["Question answering"],
+    license="cc-by-sa-4.0",
+    annotations_creators="human-annotated",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
 @inproceedings{yang-etal-2018-hotpotqa,
   address = {Brussels, Belgium},
   author = {Yang, Zhilin and
@@ -51,9 +40,24 @@ Tsujii, Jun{'}ichi},
   year = {2018},
 }
 """,
+)
+
+
+class HotpotQA(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="HotpotQA",
+        dataset={
+            "path": "mteb/hotpotqa",
+            "revision": "ab518f4d6fcca38d87c25209f94beba119d02014",
+        },
+        description=(
+            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong "
+            "supervision for supporting facts to enable more explainable question answering systems."
+        ),
         prompt={
             "query": "Given a multi-hop question, retrieve documents that can help answer the question"
         },
+        **_hotpot_qa_metadata,
     )
 
 
@@ -65,46 +69,32 @@ class HotpotQAHardNegatives(AbsTaskRetrieval):
             "revision": "617612fa63afcb60e3b134bed8b7216a99707c37",
         },
         description=(
-            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong"
-            " supervision for supporting facts to enable more explainable question answering systems. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong "
+            "supervision for supporting facts to enable more explainable question answering systems. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
         ),
-        reference="https://hotpotqa.github.io/",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=("2018-01-01", "2018-12-31"),  # best guess: based on publication date
-        domains=["Web", "Written"],
-        task_subtypes=["Question answering"],
-        license="cc-by-sa-4.0",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@inproceedings{yang-etal-2018-hotpotqa,
-  address = {Brussels, Belgium},
-  author = {Yang, Zhilin and
-Qi, Peng and
-Zhang, Saizheng and
-Bengio, Yoshua and
-Cohen, William and
-Salakhutdinov, Ruslan and
-Manning, Christopher D.},
-  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
-  doi = {10.18653/v1/D18-1259},
-  editor = {Riloff, Ellen and
-Chiang, David and
-Hockenmaier, Julia and
-Tsujii, Jun{'}ichi},
-  month = oct # {-} # nov,
-  pages = {2369--2380},
-  publisher = {Association for Computational Linguistics},
-  title = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
-  url = {https://aclanthology.org/D18-1259},
-  year = {2018},
-}
-""",
         adapted_from=["HotpotQA"],
+        superseded_by="HotpotQAHardNegatives.v2",
+        **_hotpot_qa_metadata,
+    )
+
+
+class HotpotQAHardNegativesV2(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="HotpotQAHardNegatives.v2",
+        dataset={
+            "path": "mteb/HotpotQA_test_top_250_only_w_correct-v2",
+            "revision": "617612fa63afcb60e3b134bed8b7216a99707c37",
+        },
+        description=(
+            "HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong "
+            "supervision for supporting facts to enable more explainable question answering systems. "
+            "The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
+        ),
+        adapted_from=["HotpotQA"],
+        prompt={
+            "query": "Given a multi-hop question, retrieve documents that can help answer the question"
+        },
+        **_hotpot_qa_metadata,
     )
```
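As with DBPedia and FEVER, the deprecated hard-negative task stays importable but now carries a machine-readable pointer to its replacement. A minimal sketch of following that pointer (assuming mteb's public `get_task` helper):

```python
import mteb

task = mteb.get_task("HotpotQAHardNegatives")
if task.metadata.superseded_by:  # "HotpotQAHardNegatives.v2", per the metadata above
    task = mteb.get_task(task.metadata.superseded_by)
print(task.metadata.name)  # HotpotQAHardNegatives.v2
```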
mteb/tasks/retrieval/eng/legal_summarization_retrieval.py (+1 -1):

```diff
@@ -5,7 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class LegalSummarization(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="LegalSummarization",
-        description="The dataset
+        description="The dataset consists of 439 pairs of contracts and their summarizations from https://tldrlegal.com and https://tosdr.org/.",
         reference="https://github.com/lauramanor/legal_summarization",
         dataset={
             "path": "mteb/legal_summarization",
```
mteb/tasks/retrieval/eng/lit_search_retrieval.py (+1 -8):

```diff
@@ -7,14 +7,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
 class LitSearchRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="LitSearchRetrieval",
-        description="""
-        The dataset contains the query set and retrieval corpus for the paper LitSearch: A Retrieval Benchmark for
-        Scientific Literature Search. It introduces LitSearch, a retrieval benchmark comprising 597 realistic literature
-        search queries about recent ML and NLP papers. LitSearch is constructed using a combination of (1) questions
-        generated by GPT-4 based on paragraphs containing inline citations from research papers and (2) questions about
-        recently published papers, manually written by their authors. All LitSearch questions were manually examined or
-        edited by experts to ensure high quality.
-        """,
+        description="The dataset contains the query set and retrieval corpus for the paper LitSearch: A Retrieval Benchmark for Scientific Literature Search. It introduces LitSearch, a retrieval benchmark comprising 597 realistic literature search queries about recent ML and NLP papers. LitSearch is constructed using a combination of (1) questions generated by GPT-4 based on paragraphs containing inline citations from research papers and (2) questions about recently published papers, manually written by their authors. All LitSearch questions were manually examined or edited by experts to ensure high quality.",
         reference="https://github.com/princeton-nlp/LitSearch",
         dataset={
             "path": "princeton-nlp/LitSearch",
```
mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py (+0 -3):

```diff
@@ -20,7 +20,6 @@ def _load_data(path: str, splits: str, revision: str | None = None):
             "id": f"corpus-{split_name}-{idx}",
             "text": x["text_corrected"],
             "modality": "text",
-            "image": None,
         }
 
     split_datasets = {}
@@ -56,9 +55,7 @@ def _load_data(path: str, splits: str, revision: str | None = None):
         queries[split] = split_dataset.map(
             lambda x, idx: {
                 "id": f"query-{split}-{idx}",
-                "text": None,
                 "modality": "image",
-                # "image": None,
             },
             with_indices=True,
             remove_columns=[
```
mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py (+0 -2):

```diff
@@ -18,7 +18,6 @@ def _load_data(path: str, splits: str, revision: str | None = None):
     def map_function(split_name):
         return lambda x, idx: {
             "id": f"corpus-{split_name}-{idx}",
-            "text": None,
             "modality": "image",
         }
 
@@ -56,7 +55,6 @@ def _load_data(path: str, splits: str, revision: str | None = None):
                 "id": f"query-{split}-{idx}",
                 "text": x["text_corrected"],
                 "modality": "text",
-                "image": None,
             },
             with_indices=True,
             remove_columns=[
```
mteb/tasks/retrieval/eng/quora_retrieval.py (+51 -46):

```diff
@@ -1,6 +1,32 @@
 from mteb.abstasks.retrieval import AbsTaskRetrieval
 from mteb.abstasks.task_metadata import TaskMetadata
 
+_quora_metadata = dict(
+    reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs",
+    type="Retrieval",
+    category="t2t",
+    modalities=["text"],
+    eval_splits=["test"],
+    eval_langs=["eng-Latn"],
+    main_score="ndcg_at_10",
+    date=None,
+    domains=["Written", "Web", "Blog"],
+    task_subtypes=["Question answering"],
+    license="not specified",
+    annotations_creators="human-annotated",
+    dialect=[],
+    sample_creation="found",
+    bibtex_citation=r"""
+@misc{quora-question-pairs,
+  author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung},
+  publisher = {Kaggle},
+  title = {Quora Question Pairs},
+  url = {https://kaggle.com/competitions/quora-question-pairs},
+  year = {2017},
+}
+""",
+)
+
 
 class QuoraRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
@@ -15,32 +41,10 @@ class QuoraRetrieval(AbsTaskRetrieval):
             "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a"
             + " question, find other (duplicate) questions."
         ),
-        reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=None,
-        domains=["Written", "Web", "Blog"],
-        task_subtypes=["Question answering"],
-        license="not specified",
-        annotations_creators="human-annotated",
-        dialect=[],
-        sample_creation="found",
-        bibtex_citation=r"""
-@misc{quora-question-pairs,
-  author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung},
-  publisher = {Kaggle},
-  title = {Quora Question Pairs},
-  url = {https://kaggle.com/competitions/quora-question-pairs},
-  year = {2017},
-}
-""",
         prompt={
             "query": "Given a question, retrieve questions that are semantically equivalent to the given question"
         },
+        **_quora_metadata,
     )
 
 
@@ -57,28 +61,29 @@ class QuoraRetrievalHardNegatives(AbsTaskRetrieval):
             "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a"
             + " question, find other (duplicate) questions. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
         ),
-        reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs",
-        type="Retrieval",
-        category="t2t",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["eng-Latn"],
-        main_score="ndcg_at_10",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
-        bibtex_citation=r"""
-@misc{quora-question-pairs,
-  author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung},
-  publisher = {Kaggle},
-  title = {Quora Question Pairs},
-  url = {https://kaggle.com/competitions/quora-question-pairs},
-  year = {2017},
-}
-""",
         adapted_from=["QuoraRetrieval"],
+        superseded_by="QuoraRetrievalHardNegatives.v2",
+        **_quora_metadata,
+    )
+
+
+class QuoraRetrievalHardNegativesV2(AbsTaskRetrieval):
+    ignore_identical_ids = True
+
+    metadata = TaskMetadata(
+        name="QuoraRetrievalHardNegatives.v2",
+        dataset={
+            "path": "mteb/QuoraRetrieval_test_top_250_only_w_correct-v2",
+            "revision": "907a33577e9506221d3ba20f5a851b7c3f8dc6d3",
+        },
+        description=(
+            "QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a "
+            "question, find other (duplicate) questions. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
+            "V2 uses a more appropriate prompt rather than the default prompt for retrieval. You can get more information on the effect of different prompt in the [PR](https://github.com/embeddings-benchmark/mteb/pull/3469#issuecomment-3436467106)"
+        ),
+        adapted_from=["QuoraRetrieval"],
+        prompt={
+            "query": "Given a question, retrieve questions that are semantically equivalent to the given question"
+        },
+        **_quora_metadata,
     )
```
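Across all four files the `.v2` tasks reuse the exact dataset path and revision of their predecessors; what changes is the metadata: a task-specific `prompt`, an `adapted_from` lineage, and the `superseded_by` marker on the old task. A sketch of evaluating a model on one of the new tasks (assuming mteb 2.x's `get_model`, `get_tasks`, and `evaluate` entry points; the checkpoint name is just an example):

```python
import mteb

model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["QuoraRetrievalHardNegatives.v2"])
# Prompt-aware models pick up the task's query prompt from its metadata.
results = mteb.evaluate(model, tasks=tasks)
```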