mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/_create_dataloaders.py +8 -3
- mteb/_evaluators/any_sts_evaluator.py +14 -12
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
- mteb/_evaluators/pair_classification_evaluator.py +3 -1
- mteb/_evaluators/retrieval_metrics.py +0 -9
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +2 -1
- mteb/benchmarks/_create_table.py +1 -3
- mteb/benchmarks/benchmark.py +18 -1
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +125 -16
- mteb/benchmarks/get_benchmark.py +3 -1
- mteb/cache.py +7 -3
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/evaluate.py +26 -6
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/app.py +5 -3
- mteb/leaderboard/benchmark_selector.py +4 -2
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/instruct_wrapper.py +3 -0
- mteb/models/model_implementations/align_models.py +6 -0
- mteb/models/model_implementations/andersborges.py +51 -0
- mteb/models/model_implementations/ara_models.py +7 -0
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/blip2_models.py +9 -0
- mteb/models/model_implementations/blip_models.py +19 -0
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/cadet_models.py +8 -0
- mteb/models/model_implementations/cde_models.py +12 -0
- mteb/models/model_implementations/codefuse_models.py +15 -0
- mteb/models/model_implementations/codesage_models.py +12 -0
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/colqwen_models.py +57 -0
- mteb/models/model_implementations/emillykkejensen_models.py +70 -0
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jasper_models.py +253 -2
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/kalm_models.py +159 -25
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +8 -2
- mteb/models/model_implementations/moco_models.py +9 -0
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/openclip_models.py +16 -0
- mteb/models/model_implementations/piccolo_models.py +6 -0
- mteb/models/model_implementations/rasgaard_models.py +33 -0
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
- mteb/models/model_implementations/tarka_models.py +374 -0
- mteb/models/model_implementations/voyage_models.py +6 -7
- mteb/models/model_implementations/voyage_v.py +10 -9
- mteb/models/model_implementations/yuan_models.py +33 -0
- mteb/models/search_wrappers.py +6 -5
- mteb/results/task_result.py +19 -17
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +2 -3
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
- mteb/tasks/classification/nld/iconclass_classification.py +44 -0
- mteb/tasks/classification/nld/open_tender_classification.py +41 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nld/__init__.py +18 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb-2.1.19.dist-info/METADATA +253 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- mteb-2.0.5.dist-info/METADATA +0 -455
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
|
@@ -3,6 +3,8 @@ from dataclasses import dataclass
|
|
|
3
3
|
|
|
4
4
|
from typing_extensions import Self
|
|
5
5
|
|
|
6
|
+
from mteb.languages import check_language_code
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
@dataclass
|
|
8
10
|
class LanguageScripts:
|
|
@@ -46,8 +48,10 @@ class LanguageScripts:
|
|
|
46
48
|
if len(lang_script) == 2:
|
|
47
49
|
normalized_langs.add(lang_script[0])
|
|
48
50
|
lang_script_codes.add(lang)
|
|
51
|
+
check_language_code(lang)
|
|
49
52
|
script_codes.add(lang_script[1])
|
|
50
53
|
else:
|
|
54
|
+
check_language_code(lang)
|
|
51
55
|
normalized_langs.add(lang)
|
|
52
56
|
|
|
53
57
|
return cls(
|
mteb/leaderboard/app.py
CHANGED
|
@@ -107,7 +107,9 @@ def _update_description(
|
|
|
107
107
|
description += f" - **Number of task types**: {n_task_types}\n"
|
|
108
108
|
description += f" - **Number of domains**: {n_domains}\n"
|
|
109
109
|
if benchmark.reference is not None:
|
|
110
|
-
description +=
|
|
110
|
+
description += (
|
|
111
|
+
f'\n<a href="{benchmark.reference}" target="_blank">Click for More Info</a>'
|
|
112
|
+
)
|
|
111
113
|
|
|
112
114
|
return description
|
|
113
115
|
|
|
@@ -137,7 +139,7 @@ def _update_task_info(task_names: str) -> gr.DataFrame:
|
|
|
137
139
|
df["languages"] = df["languages"].map(_format_list)
|
|
138
140
|
df = df.sort_values("name")
|
|
139
141
|
df["domains"] = df["domains"].map(_format_list)
|
|
140
|
-
df["name"] = "
|
|
142
|
+
df["name"] = f'<a href="{df["reference"]}" target="_blank">{df["name"]}</a>'
|
|
141
143
|
df["modalities"] = df["modalities"].map(_format_list)
|
|
142
144
|
df = df.rename(
|
|
143
145
|
columns={
|
|
@@ -318,7 +320,7 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
318
320
|
"""
|
|
319
321
|
## Embedding Leaderboard
|
|
320
322
|
|
|
321
|
-
This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://
|
|
323
|
+
This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://embeddings-benchmark.github.io/mteb/contributing/adding_a_model/), [add benchmarks](https://embeddings-benchmark.github.io/mteb/contributing/adding_a_benchmark/), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/issues/new?template=enhancement.yaml).
|
|
322
324
|
"""
|
|
323
325
|
)
|
|
324
326
|
gr.Markdown(
|
|
@@ -73,6 +73,7 @@ GP_BENCHMARK_ENTRIES = [
|
|
|
73
73
|
"MTEB(fra, v1)",
|
|
74
74
|
"MTEB(jpn, v1)",
|
|
75
75
|
"MTEB(kor, v1)",
|
|
76
|
+
"MTEB(nld, v1)",
|
|
76
77
|
"MTEB(pol, v1)",
|
|
77
78
|
"MTEB(rus, v1)",
|
|
78
79
|
"MTEB(fas, v2)",
|
|
@@ -109,10 +110,11 @@ R_BENCHMARK_ENTRIES = [
|
|
|
109
110
|
MenuEntry(
|
|
110
111
|
"Image",
|
|
111
112
|
description=None,
|
|
112
|
-
open=
|
|
113
|
+
open=True,
|
|
113
114
|
benchmarks=[
|
|
114
|
-
mteb.get_benchmark("
|
|
115
|
+
mteb.get_benchmark("ViDoRe(v3)"),
|
|
115
116
|
mteb.get_benchmark("JinaVDR"),
|
|
117
|
+
MenuEntry("Other", [mteb.get_benchmark("ViDoRe(v1&v2)")]),
|
|
116
118
|
],
|
|
117
119
|
),
|
|
118
120
|
MenuEntry(
|
|
@@ -53,7 +53,7 @@ ACKNOWLEDGEMENT = """
|
|
|
53
53
|
<img src="https://play-lh.googleusercontent.com/HdfHZ5jnfMM1Ep7XpPaVdFIVSRx82wKlRC_qmnHx9H1E4aWNp4WKoOcH0x95NAnuYg" width="60" height="55" style="padding: 10px;">
|
|
54
54
|
</a>
|
|
55
55
|
<a href="https://huggingface.co">
|
|
56
|
-
<img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/hf_logo.png" width="60" height="55" style="padding: 10px;">
|
|
56
|
+
<img src="https://raw.githubusercontent.com/embeddings-benchmark/mteb/main/docs/images/logos/hf_logo.png" width="60" height="55" style="padding: 10px;">
|
|
57
57
|
</a>
|
|
58
58
|
</div>
|
|
59
59
|
|
mteb/models/instruct_wrapper.py
CHANGED
|
@@ -153,6 +153,9 @@ class InstructSentenceTransformerModel(AbsEncoder):
|
|
|
153
153
|
|
|
154
154
|
self.model_name = model_name
|
|
155
155
|
self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
|
|
156
|
+
if max_seq_length:
|
|
157
|
+
# https://github.com/huggingface/sentence-transformers/issues/3575
|
|
158
|
+
self.model.max_seq_length = max_seq_length
|
|
156
159
|
self.apply_instruction_to_passages = apply_instruction_to_passages
|
|
157
160
|
self.prompts_dict = prompts_dict
|
|
158
161
|
|
|
@@ -124,4 +124,10 @@ align_base = ModelMeta(
|
|
|
124
124
|
training_datasets=set(
|
|
125
125
|
# COYO-700M
|
|
126
126
|
),
|
|
127
|
+
citation="""@misc{kakaobrain2022coyo-align,
|
|
128
|
+
title = {COYO-ALIGN},
|
|
129
|
+
author = {Yoon, Boogeo and Lee, Youhan and Baek, Woonhyuk},
|
|
130
|
+
year = {2022},
|
|
131
|
+
howpublished = {https://github.com/kakaobrain/coyo-align},
|
|
132
|
+
}""",
|
|
127
133
|
)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from mteb.models.model_implementations.model2vec_models import Model2VecModel
|
|
4
|
+
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
5
|
+
|
|
6
|
+
model2vecdk = ModelMeta(
|
|
7
|
+
loader=Model2VecModel, # type: ignore
|
|
8
|
+
name="andersborges/model2vecdk",
|
|
9
|
+
languages=["dan-Latn"],
|
|
10
|
+
open_weights=True,
|
|
11
|
+
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
12
|
+
release_date="2025-11-21",
|
|
13
|
+
n_parameters=48042496,
|
|
14
|
+
memory_usage_mb=183,
|
|
15
|
+
max_tokens=np.inf,
|
|
16
|
+
embed_dim=256,
|
|
17
|
+
license="mit",
|
|
18
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
19
|
+
framework=["NumPy", "Sentence Transformers"],
|
|
20
|
+
reference="https://huggingface.co/andersborges/model2vecdk",
|
|
21
|
+
use_instructions=False,
|
|
22
|
+
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
23
|
+
superseded_by=None,
|
|
24
|
+
training_datasets=set(), # distilled
|
|
25
|
+
public_training_code="https://github.com/andersborges/dkmodel2vec",
|
|
26
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
model2vecdk_stem = ModelMeta(
|
|
31
|
+
loader=Model2VecModel, # type: ignore
|
|
32
|
+
name="andersborges/model2vecdk-stem",
|
|
33
|
+
languages=["dan-Latn"],
|
|
34
|
+
open_weights=True,
|
|
35
|
+
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
36
|
+
release_date="2025-11-21",
|
|
37
|
+
n_parameters=48578560,
|
|
38
|
+
memory_usage_mb=185,
|
|
39
|
+
max_tokens=np.inf,
|
|
40
|
+
embed_dim=256,
|
|
41
|
+
license="mit",
|
|
42
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
43
|
+
framework=["NumPy", "Sentence Transformers"],
|
|
44
|
+
reference="https://huggingface.co/andersborges/model2vecdk-stem",
|
|
45
|
+
use_instructions=False,
|
|
46
|
+
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
47
|
+
superseded_by=None,
|
|
48
|
+
training_datasets=set(), # distilled
|
|
49
|
+
public_training_code="https://github.com/andersborges/dkmodel2vec",
|
|
50
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
51
|
+
)
|
|
@@ -23,4 +23,11 @@ arabic_triplet_matryoshka = ModelMeta(
|
|
|
23
23
|
training_datasets=set(
|
|
24
24
|
# "akhooli/arabic-triplets-1m-curated-sims-len"
|
|
25
25
|
),
|
|
26
|
+
citation="""
|
|
27
|
+
@article{nacar2025gate,
|
|
28
|
+
title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Matryoshka Representation Learning and Hybrid Loss Training},
|
|
29
|
+
author={Nacar, Omer and Koubaa, Anis and Sibaee, Serry and Al-Habashi, Yasser and Ammar, Adel and Boulila, Wadii},
|
|
30
|
+
journal={arXiv preprint arXiv:2505.24581},
|
|
31
|
+
year={2025}
|
|
32
|
+
}""",
|
|
26
33
|
)
|
|
@@ -2,7 +2,7 @@ from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
|
2
2
|
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
3
3
|
|
|
4
4
|
b1ade_training_data = {
|
|
5
|
-
# We are in
|
|
5
|
+
# We are in the process of submitting a paper outlining our process of creating b1ade using model merging and knowledge distillation.
|
|
6
6
|
# Similar to mixedbread models, we do not train on any data (except the MSMarco training split) of MTEB.
|
|
7
7
|
"MSMARCO",
|
|
8
8
|
}
|
|
@@ -62,7 +62,7 @@ bge_m3_training_data = {
|
|
|
62
62
|
# mMARCO-ZH
|
|
63
63
|
# LawGPT
|
|
64
64
|
# NLI-zh2, LeCaRDv2,
|
|
65
|
-
# NLI, MultiLongDoc (their
|
|
65
|
+
# NLI, MultiLongDoc (their synthetic)
|
|
66
66
|
# + synthetic data
|
|
67
67
|
}
|
|
68
68
|
|
|
@@ -141,7 +141,6 @@ bge_chinese_training_data = {
|
|
|
141
141
|
# https://huggingface.co/BAAI/bge-m3/discussions/29
|
|
142
142
|
bgem3_languages = [
|
|
143
143
|
"afr-Latn", # af
|
|
144
|
-
# als
|
|
145
144
|
"amh-Ethi", # am
|
|
146
145
|
# an
|
|
147
146
|
# ar
|
|
@@ -151,7 +150,6 @@ bgem3_languages = [
|
|
|
151
150
|
# av
|
|
152
151
|
# az
|
|
153
152
|
"azj-Latn", # azb
|
|
154
|
-
# ba
|
|
155
153
|
# bar
|
|
156
154
|
# bcl
|
|
157
155
|
"ben-Beng", # be
|
|
@@ -10,6 +10,13 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
BLIP2_CITATION = """@inproceedings{li2023blip2,
|
|
14
|
+
title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
|
|
15
|
+
author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
|
|
16
|
+
year={2023},
|
|
17
|
+
booktitle={ICML},
|
|
18
|
+
}"""
|
|
19
|
+
|
|
13
20
|
|
|
14
21
|
def blip2_loader(model_name, **kwargs):
|
|
15
22
|
requires_package(
|
|
@@ -176,6 +183,7 @@ blip2_opt_2_7b = ModelMeta(
|
|
|
176
183
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
177
184
|
use_instructions=False,
|
|
178
185
|
training_datasets=blip2_training_datasets,
|
|
186
|
+
citation=BLIP2_CITATION,
|
|
179
187
|
)
|
|
180
188
|
|
|
181
189
|
blip2_opt_6_7b_coco = ModelMeta(
|
|
@@ -198,4 +206,5 @@ blip2_opt_6_7b_coco = ModelMeta(
|
|
|
198
206
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
199
207
|
use_instructions=False,
|
|
200
208
|
training_datasets=blip2_training_datasets,
|
|
209
|
+
citation=BLIP2_CITATION,
|
|
201
210
|
)
|
|
@@ -10,6 +10,17 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
11
|
from mteb.types import Array, BatchedInput, PromptType
|
|
12
12
|
|
|
13
|
+
BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
|
|
14
|
+
doi = {10.48550/ARXIV.2201.12086},
|
|
15
|
+
url = {https://arxiv.org/abs/2201.12086},
|
|
16
|
+
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
|
|
17
|
+
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
|
|
18
|
+
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
|
|
19
|
+
publisher = {arXiv},
|
|
20
|
+
year = {2022},
|
|
21
|
+
copyright = {Creative Commons Attribution 4.0 International}
|
|
22
|
+
}"""
|
|
23
|
+
|
|
13
24
|
|
|
14
25
|
class BLIPModel(AbsEncoder):
|
|
15
26
|
def __init__(
|
|
@@ -140,6 +151,7 @@ blip_image_captioning_large = ModelMeta(
|
|
|
140
151
|
# CC3M+CC12M+SBU
|
|
141
152
|
# LAION115M
|
|
142
153
|
),
|
|
154
|
+
citation=BLIP_CITATION,
|
|
143
155
|
)
|
|
144
156
|
|
|
145
157
|
blip_image_captioning_base = ModelMeta(
|
|
@@ -166,6 +178,7 @@ blip_image_captioning_base = ModelMeta(
|
|
|
166
178
|
# CC3M+CC12M+SBU
|
|
167
179
|
# LAION115M
|
|
168
180
|
),
|
|
181
|
+
citation=BLIP_CITATION,
|
|
169
182
|
)
|
|
170
183
|
|
|
171
184
|
|
|
@@ -192,6 +205,7 @@ blip_vqa_base = ModelMeta(
|
|
|
192
205
|
# CC3M+CC12M+SBU
|
|
193
206
|
# LAION115M
|
|
194
207
|
),
|
|
208
|
+
citation=BLIP_CITATION,
|
|
195
209
|
)
|
|
196
210
|
|
|
197
211
|
blip_vqa_capfilt_large = ModelMeta(
|
|
@@ -217,6 +231,7 @@ blip_vqa_capfilt_large = ModelMeta(
|
|
|
217
231
|
# CC3M+CC12M+SBU
|
|
218
232
|
# LAION115M
|
|
219
233
|
),
|
|
234
|
+
citation=BLIP_CITATION,
|
|
220
235
|
)
|
|
221
236
|
|
|
222
237
|
blip_itm_base_coco = ModelMeta(
|
|
@@ -242,6 +257,7 @@ blip_itm_base_coco = ModelMeta(
|
|
|
242
257
|
# CC3M+CC12M+SBU
|
|
243
258
|
# LAION115M
|
|
244
259
|
),
|
|
260
|
+
citation=BLIP_CITATION,
|
|
245
261
|
)
|
|
246
262
|
|
|
247
263
|
blip_itm_large_coco = ModelMeta(
|
|
@@ -268,6 +284,7 @@ blip_itm_large_coco = ModelMeta(
|
|
|
268
284
|
# CC3M+CC12M+SBU
|
|
269
285
|
# LAION115M
|
|
270
286
|
),
|
|
287
|
+
citation=BLIP_CITATION,
|
|
271
288
|
)
|
|
272
289
|
|
|
273
290
|
blip_itm_base_flickr = ModelMeta(
|
|
@@ -294,6 +311,7 @@ blip_itm_base_flickr = ModelMeta(
|
|
|
294
311
|
# LAION115M
|
|
295
312
|
# Flickr30k
|
|
296
313
|
),
|
|
314
|
+
citation=BLIP_CITATION,
|
|
297
315
|
)
|
|
298
316
|
|
|
299
317
|
blip_itm_large_flickr = ModelMeta(
|
|
@@ -319,4 +337,5 @@ blip_itm_large_flickr = ModelMeta(
|
|
|
319
337
|
# CC3M+CC12M+SBU
|
|
320
338
|
# LAION115M
|
|
321
339
|
),
|
|
340
|
+
citation=BLIP_CITATION,
|
|
322
341
|
)
|
|
@@ -48,7 +48,7 @@ class BMRetrieverWrapper(InstructSentenceTransformerModel):
|
|
|
48
48
|
if padding_side is not None:
|
|
49
49
|
tokenizer_params["padding_side"] = padding_side
|
|
50
50
|
kwargs.setdefault("tokenizer_args", {}).update(tokenizer_params)
|
|
51
|
-
kwargs.setdefault("config_args", {}).update(
|
|
51
|
+
kwargs.setdefault("config_args", {}).update(revision=revision)
|
|
52
52
|
|
|
53
53
|
transformer = Transformer(
|
|
54
54
|
model_name,
|
|
@@ -3,6 +3,13 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
|
|
|
3
3
|
|
|
4
4
|
from .bge_models import bge_m3_training_data
|
|
5
5
|
|
|
6
|
+
CADET_CITATION = """@article{tamber2025conventionalcontrastivelearningfalls,
|
|
7
|
+
title={Conventional Contrastive Learning Often Falls Short: Improving Dense Retrieval with Cross-Encoder Listwise Distillation and Synthetic Data},
|
|
8
|
+
author={Manveer Singh Tamber and Suleman Kazi and Vivek Sourabh and Jimmy Lin},
|
|
9
|
+
journal={arXiv:2505.19274},
|
|
10
|
+
year={2025}
|
|
11
|
+
}"""
|
|
12
|
+
|
|
6
13
|
cadet_training_data = {
|
|
7
14
|
# we train with the corpora of FEVER, MSMARCO, and DBPEDIA. We only train with synthetic generated queries.
|
|
8
15
|
# However, we do use queries from MSMARCO as examples for synthetic query generation.
|
|
@@ -46,4 +53,5 @@ cadet_embed = ModelMeta(
|
|
|
46
53
|
public_training_data="https://github.com/manveertamber/cadet-dense-retrieval",
|
|
47
54
|
training_datasets=cadet_training_data,
|
|
48
55
|
adapted_from="intfloat/e5-base-unsupervised",
|
|
56
|
+
citation=CADET_CITATION,
|
|
49
57
|
)
|
|
@@ -24,6 +24,16 @@ if TYPE_CHECKING:
|
|
|
24
24
|
)
|
|
25
25
|
logger = logging.getLogger(__name__)
|
|
26
26
|
|
|
27
|
+
CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
|
|
28
|
+
title={Contextual Document Embeddings},
|
|
29
|
+
author={John X. Morris and Alexander M. Rush},
|
|
30
|
+
year={2024},
|
|
31
|
+
eprint={2410.02525},
|
|
32
|
+
archivePrefix={arXiv},
|
|
33
|
+
primaryClass={cs.CL},
|
|
34
|
+
url={https://arxiv.org/abs/2410.02525},
|
|
35
|
+
}"""
|
|
36
|
+
|
|
27
37
|
|
|
28
38
|
class CDEWrapper(SentenceTransformerEncoderWrapper):
|
|
29
39
|
dataset_embeddings: torch.Tensor | None = None
|
|
@@ -217,6 +227,7 @@ cde_small_v1 = ModelMeta(
|
|
|
217
227
|
training_datasets=bge_full_data,
|
|
218
228
|
public_training_code="https://github.com/jxmorris12/cde",
|
|
219
229
|
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
|
|
230
|
+
citation=CDE_CITATION,
|
|
220
231
|
)
|
|
221
232
|
|
|
222
233
|
cde_small_v2 = ModelMeta(
|
|
@@ -244,4 +255,5 @@ cde_small_v2 = ModelMeta(
|
|
|
244
255
|
training_datasets=bge_full_data,
|
|
245
256
|
public_training_code="https://github.com/jxmorris12/cde",
|
|
246
257
|
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
|
|
258
|
+
citation=CDE_CITATION,
|
|
247
259
|
)
|
|
@@ -2,6 +2,18 @@ from mteb.models import ModelMeta
|
|
|
2
2
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
3
3
|
from mteb.types import PromptType
|
|
4
4
|
|
|
5
|
+
F2LLM_CITATION = """@article{2025F2LLM,
|
|
6
|
+
title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
|
|
7
|
+
author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
|
|
8
|
+
journal={CoRR},
|
|
9
|
+
volume={abs/2510.02294},
|
|
10
|
+
year={2025},
|
|
11
|
+
url={https://doi.org/10.48550/arXiv.2510.02294},
|
|
12
|
+
doi={10.48550/ARXIV.2510.02294},
|
|
13
|
+
eprinttype={arXiv},
|
|
14
|
+
eprint={2510.02294}
|
|
15
|
+
}"""
|
|
16
|
+
|
|
5
17
|
training_datasets = {
|
|
6
18
|
"MSMARCO",
|
|
7
19
|
"ArguAna",
|
|
@@ -146,6 +158,7 @@ F2LLM_0B6 = ModelMeta(
|
|
|
146
158
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
147
159
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
148
160
|
training_datasets=training_datasets,
|
|
161
|
+
citation=F2LLM_CITATION,
|
|
149
162
|
)
|
|
150
163
|
|
|
151
164
|
F2LLM_1B7 = ModelMeta(
|
|
@@ -174,6 +187,7 @@ F2LLM_1B7 = ModelMeta(
|
|
|
174
187
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
175
188
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
176
189
|
training_datasets=training_datasets,
|
|
190
|
+
citation=F2LLM_CITATION,
|
|
177
191
|
)
|
|
178
192
|
|
|
179
193
|
F2LLM_4B = ModelMeta(
|
|
@@ -202,4 +216,5 @@ F2LLM_4B = ModelMeta(
|
|
|
202
216
|
public_training_code="https://github.com/codefuse-ai/F2LLM",
|
|
203
217
|
public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
|
|
204
218
|
training_datasets=training_datasets,
|
|
219
|
+
citation=F2LLM_CITATION,
|
|
205
220
|
)
|
|
@@ -1,6 +1,15 @@
|
|
|
1
1
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
2
2
|
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
3
3
|
|
|
4
|
+
CODESAGE_CITATION = """@inproceedings{
|
|
5
|
+
zhang2024code,
|
|
6
|
+
title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
|
|
7
|
+
author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
|
|
8
|
+
booktitle={The Twelfth International Conference on Learning Representations},
|
|
9
|
+
year={2024},
|
|
10
|
+
url={https://openreview.net/forum?id=vfzRRjumpX}
|
|
11
|
+
}"""
|
|
12
|
+
|
|
4
13
|
codesage_languages = [
|
|
5
14
|
"python-Code",
|
|
6
15
|
"javascript-Code",
|
|
@@ -33,6 +42,7 @@ codesage_large = ModelMeta(
|
|
|
33
42
|
"CodeSearchNetRetrieval",
|
|
34
43
|
"CodeSearchNetCCRetrieval",
|
|
35
44
|
},
|
|
45
|
+
citation=CODESAGE_CITATION,
|
|
36
46
|
)
|
|
37
47
|
|
|
38
48
|
codesage_base = ModelMeta(
|
|
@@ -58,6 +68,7 @@ codesage_base = ModelMeta(
|
|
|
58
68
|
"CodeSearchNetRetrieval",
|
|
59
69
|
"CodeSearchNetCCRetrieval",
|
|
60
70
|
},
|
|
71
|
+
citation=CODESAGE_CITATION,
|
|
61
72
|
)
|
|
62
73
|
|
|
63
74
|
codesage_small = ModelMeta(
|
|
@@ -83,4 +94,5 @@ codesage_small = ModelMeta(
|
|
|
83
94
|
"CodeSearchNetRetrieval",
|
|
84
95
|
"CodeSearchNetCCRetrieval",
|
|
85
96
|
},
|
|
97
|
+
citation=CODESAGE_CITATION,
|
|
86
98
|
)
|
|
@@ -221,7 +221,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
|
|
|
221
221
|
) -> None:
|
|
222
222
|
import cohere # type: ignore
|
|
223
223
|
|
|
224
|
-
self.model_name = model_name.
|
|
224
|
+
self.model_name = model_name.removeprefix("Cohere/Cohere-")
|
|
225
225
|
self.sep = sep
|
|
226
226
|
self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
|
|
227
227
|
if embedding_type not in get_args(EmbeddingType):
|
|
@@ -220,3 +220,60 @@ colnomic_7b = ModelMeta(
|
|
|
220
220
|
training_datasets=COLNOMIC_TRAINING_DATA,
|
|
221
221
|
citation=COLNOMIC_CITATION,
|
|
222
222
|
)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
EVOQWEN_TRAINING_DATA = {
|
|
226
|
+
"colpali_train_set",
|
|
227
|
+
"VisRAG-Ret-Train-Synthetic-data",
|
|
228
|
+
"VisRAG-Ret-Train-In-domain-data",
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
evoqwen25_vl_retriever_3b_v1 = ModelMeta(
|
|
232
|
+
loader=ColQwen2_5Wrapper,
|
|
233
|
+
loader_kwargs=dict(
|
|
234
|
+
torch_dtype=torch.float16, attn_implementation="flash_attention_2"
|
|
235
|
+
),
|
|
236
|
+
name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1",
|
|
237
|
+
languages=["eng-Latn"],
|
|
238
|
+
revision="aeacaa2775f2758d82721eb1cf2f5daf1a392da9",
|
|
239
|
+
release_date="2025-11-04",
|
|
240
|
+
modalities=["image", "text"],
|
|
241
|
+
n_parameters=3_000_000_000,
|
|
242
|
+
memory_usage_mb=7200,
|
|
243
|
+
max_tokens=128000,
|
|
244
|
+
embed_dim=128,
|
|
245
|
+
license="apache-2.0",
|
|
246
|
+
open_weights=True,
|
|
247
|
+
public_training_code="https://github.com/illuin-tech/colpali",
|
|
248
|
+
public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
|
|
249
|
+
framework=["ColPali"],
|
|
250
|
+
reference="https://huggingface.co/ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-3B-v1",
|
|
251
|
+
similarity_fn_name="MaxSim",
|
|
252
|
+
use_instructions=True,
|
|
253
|
+
training_datasets=EVOQWEN_TRAINING_DATA,
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
evoqwen25_vl_retriever_7b_v1 = ModelMeta(
|
|
257
|
+
loader=ColQwen2_5Wrapper,
|
|
258
|
+
loader_kwargs=dict(
|
|
259
|
+
torch_dtype=torch.float16, attn_implementation="flash_attention_2"
|
|
260
|
+
),
|
|
261
|
+
name="ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1",
|
|
262
|
+
languages=["eng-Latn"],
|
|
263
|
+
revision="8952ac6ee0e7de2e9211b165921518caf9202110",
|
|
264
|
+
release_date="2025-11-04",
|
|
265
|
+
modalities=["image", "text"],
|
|
266
|
+
n_parameters=7_000_000_000,
|
|
267
|
+
memory_usage_mb=14400,
|
|
268
|
+
max_tokens=128000,
|
|
269
|
+
embed_dim=128,
|
|
270
|
+
license="apache-2.0",
|
|
271
|
+
open_weights=True,
|
|
272
|
+
public_training_code="https://github.com/illuin-tech/colpali",
|
|
273
|
+
public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
|
|
274
|
+
framework=["ColPali"],
|
|
275
|
+
reference="https://huggingface.co/ApsaraStackMaaS/EvoQwen2.5-VL-Retriever-7B-v1",
|
|
276
|
+
similarity_fn_name="MaxSim",
|
|
277
|
+
use_instructions=True,
|
|
278
|
+
training_datasets=EVOQWEN_TRAINING_DATA,
|
|
279
|
+
)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from mteb.models.model_meta import ModelMeta
|
|
2
|
+
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
3
|
+
|
|
4
|
+
embedding_gemma_300m_scandi = ModelMeta(
|
|
5
|
+
loader=sentence_transformers_loader, # type: ignore
|
|
6
|
+
name="emillykkejensen/EmbeddingGemma-Scandi-300m",
|
|
7
|
+
languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
|
|
8
|
+
open_weights=True,
|
|
9
|
+
revision="9f3307b9f601db564a9190cb475324d128dcfe86",
|
|
10
|
+
release_date="2025-10-17",
|
|
11
|
+
n_parameters=307_581_696,
|
|
12
|
+
embed_dim=768,
|
|
13
|
+
max_tokens=2048,
|
|
14
|
+
license="apache-2.0",
|
|
15
|
+
reference="https://huggingface.co/emillykkejensen/EmbeddingGemma-Scandi-300m",
|
|
16
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
17
|
+
use_instructions=True,
|
|
18
|
+
public_training_code=None,
|
|
19
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
20
|
+
training_datasets=set(),
|
|
21
|
+
similarity_fn_name="cosine", # type: ignore[arg-type]
|
|
22
|
+
adapted_from="google/embeddinggemma-300m",
|
|
23
|
+
memory_usage_mb=578,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
qwen_scandi = ModelMeta(
|
|
28
|
+
loader=sentence_transformers_loader, # type: ignore
|
|
29
|
+
name="emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
|
|
30
|
+
languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
|
|
31
|
+
open_weights=True,
|
|
32
|
+
revision="cf1e7ba36ebd3d605549d8f02930a18e17b54513",
|
|
33
|
+
release_date="2025-10-17",
|
|
34
|
+
n_parameters=595776512,
|
|
35
|
+
memory_usage_mb=2272,
|
|
36
|
+
embed_dim=1024,
|
|
37
|
+
max_tokens=32768,
|
|
38
|
+
license="apache-2.0",
|
|
39
|
+
reference="https://huggingface.co/emillykkejensen/Qwen3-Embedding-Scandi-0.6B",
|
|
40
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
41
|
+
use_instructions=True,
|
|
42
|
+
public_training_code=None,
|
|
43
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
44
|
+
training_datasets=set(),
|
|
45
|
+
similarity_fn_name="cosine", # type: ignore[arg-type]
|
|
46
|
+
adapted_from="Qwen/Qwen3-Embedding-0.6B",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
mmbert_scandi = ModelMeta(
|
|
51
|
+
loader=sentence_transformers_loader, # type: ignore
|
|
52
|
+
name="emillykkejensen/mmBERTscandi-base-embedding",
|
|
53
|
+
languages=["dan-Latn", "swe-Latn", "nor-Latn", "nob-Latn", "nno-Latn"],
|
|
54
|
+
open_weights=True,
|
|
55
|
+
revision="82d74c7a5d8e1ddf31b132865df2d16b2b0294ee",
|
|
56
|
+
release_date="2025-10-17",
|
|
57
|
+
n_parameters=306939648,
|
|
58
|
+
memory_usage_mb=1171,
|
|
59
|
+
embed_dim=768,
|
|
60
|
+
max_tokens=8192,
|
|
61
|
+
license="apache-2.0",
|
|
62
|
+
reference="https://huggingface.co/emillykkejensen/mmBERTscandi-base-embedding",
|
|
63
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
64
|
+
use_instructions=True,
|
|
65
|
+
public_training_code=None,
|
|
66
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
67
|
+
training_datasets=set(),
|
|
68
|
+
similarity_fn_name="cosine", # type: ignore[arg-type]
|
|
69
|
+
adapted_from="jonasaise/scandmmBERT-base-scandinavian",
|
|
70
|
+
)
|
|
@@ -39,7 +39,7 @@ class Encoder(torch.nn.Module):
|
|
|
39
39
|
self.max_length = max_length
|
|
40
40
|
self.normalize = normalize
|
|
41
41
|
self.processor.tokenizer.padding_side = "right"
|
|
42
|
-
self.
|
|
42
|
+
self.default_instruction = "You are a helpful assistant."
|
|
43
43
|
|
|
44
44
|
def forward(
|
|
45
45
|
self,
|
|
@@ -103,7 +103,7 @@ class Encoder(torch.nn.Module):
|
|
|
103
103
|
instruction=None,
|
|
104
104
|
**kwargs,
|
|
105
105
|
):
|
|
106
|
-
instruction = instruction or self.
|
|
106
|
+
instruction = instruction or self.default_instruction
|
|
107
107
|
# Inputs must be batched
|
|
108
108
|
input_texts, input_images = [], []
|
|
109
109
|
for t, i in zip(texts, images):
|
|
@@ -79,7 +79,7 @@ granite_training_data = {
|
|
|
79
79
|
"MIRACLReranking",
|
|
80
80
|
# Multilingual MrTydi Triples
|
|
81
81
|
"MrTidyRetrieval",
|
|
82
|
-
# Sadeeem Question
|
|
82
|
+
# Sadeem Question Answering
|
|
83
83
|
# DBPedia Title-Body Pairs
|
|
84
84
|
"DBPedia",
|
|
85
85
|
"DBPedia-NL", # translated from hotpotQA (not trained on)
|
|
@@ -4,7 +4,7 @@ from mteb.models.model_meta import (
|
|
|
4
4
|
)
|
|
5
5
|
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
inf_retriever_v1_training_data = {
|
|
8
8
|
# eng_Latn
|
|
9
9
|
"ArguAna",
|
|
10
10
|
"CQADupstackRetrieval",
|
|
@@ -66,7 +66,7 @@ inf_retriever_v1 = ModelMeta(
|
|
|
66
66
|
adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct",
|
|
67
67
|
public_training_code=None,
|
|
68
68
|
public_training_data=None,
|
|
69
|
-
training_datasets=
|
|
69
|
+
training_datasets=inf_retriever_v1_training_data,
|
|
70
70
|
citation=INF_RETRIEVER_CITATION,
|
|
71
71
|
)
|
|
72
72
|
|
|
@@ -92,6 +92,6 @@ inf_retriever_v1_1_5b = ModelMeta(
|
|
|
92
92
|
adapted_from="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
|
93
93
|
public_training_code=None,
|
|
94
94
|
public_training_data=None,
|
|
95
|
-
training_datasets=
|
|
95
|
+
training_datasets=inf_retriever_v1_training_data,
|
|
96
96
|
citation=INF_RETRIEVER_CITATION,
|
|
97
97
|
)
|