mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import math
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import torch
|
|
6
|
-
from PIL import Image
|
|
7
8
|
from torch.utils.data import DataLoader
|
|
8
9
|
from tqdm.autonotebook import tqdm
|
|
9
10
|
|
|
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
14
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
15
19
|
logger = logging.getLogger(__name__)
|
|
16
20
|
|
|
17
21
|
GME_CITATION = """@misc{zhang2024gme,
|
|
@@ -267,9 +271,9 @@ def smart_resize(
|
|
|
267
271
|
return h_bar, w_bar
|
|
268
272
|
|
|
269
273
|
|
|
270
|
-
def fetch_image(
|
|
271
|
-
|
|
272
|
-
|
|
274
|
+
def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
|
|
275
|
+
from PIL import Image
|
|
276
|
+
|
|
273
277
|
image_obj = None
|
|
274
278
|
if isinstance(image, Image.Image):
|
|
275
279
|
image_obj = image
|
|
@@ -342,6 +346,7 @@ training_data = {
|
|
|
342
346
|
gme_qwen2vl_2b = ModelMeta(
|
|
343
347
|
loader=GmeQwen2VL,
|
|
344
348
|
name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
|
|
349
|
+
model_type=["dense"],
|
|
345
350
|
languages=["eng-Latn", "cmn-Hans"],
|
|
346
351
|
open_weights=True,
|
|
347
352
|
revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
|
|
@@ -365,6 +370,7 @@ gme_qwen2vl_2b = ModelMeta(
|
|
|
365
370
|
gme_qwen2vl_7b = ModelMeta(
|
|
366
371
|
loader=GmeQwen2VL,
|
|
367
372
|
name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
|
|
373
|
+
model_type=["dense"],
|
|
368
374
|
languages=["eng-Latn", "cmn-Hans"],
|
|
369
375
|
open_weights=True,
|
|
370
376
|
revision="477027a6480f8630363be77751f169cc3434b673",
|
|
@@ -147,10 +147,10 @@ class GoogleTextEmbeddingModel(AbsEncoder):
|
|
|
147
147
|
google_text_emb_004 = ModelMeta(
|
|
148
148
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
149
149
|
loader_kwargs=dict(
|
|
150
|
-
model_name="text-embedding-004",
|
|
151
150
|
model_prompts=MODEL_PROMPTS,
|
|
152
151
|
),
|
|
153
152
|
name="google/text-embedding-004",
|
|
153
|
+
model_type=["dense"],
|
|
154
154
|
languages=["eng-Latn"],
|
|
155
155
|
open_weights=False,
|
|
156
156
|
revision="1", # revision is intended for implementation
|
|
@@ -172,10 +172,10 @@ google_text_emb_004 = ModelMeta(
|
|
|
172
172
|
google_text_emb_005 = ModelMeta(
|
|
173
173
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
174
174
|
loader_kwargs=dict(
|
|
175
|
-
model_name="text-embedding-005",
|
|
176
175
|
model_prompts=MODEL_PROMPTS,
|
|
177
176
|
),
|
|
178
177
|
name="google/text-embedding-005",
|
|
178
|
+
model_type=["dense"],
|
|
179
179
|
languages=["eng-Latn"],
|
|
180
180
|
open_weights=False,
|
|
181
181
|
revision="1", # revision is intended for implementation
|
|
@@ -197,10 +197,10 @@ google_text_emb_005 = ModelMeta(
|
|
|
197
197
|
google_text_multilingual_emb_002 = ModelMeta(
|
|
198
198
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
199
199
|
loader_kwargs=dict(
|
|
200
|
-
model_name="text-embedding-002",
|
|
201
200
|
model_prompts=MODEL_PROMPTS,
|
|
202
201
|
),
|
|
203
202
|
name="google/text-multilingual-embedding-002",
|
|
203
|
+
model_type=["dense"],
|
|
204
204
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES, # From the list of evaluated languages in https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#supported_text_languages
|
|
205
205
|
open_weights=False,
|
|
206
206
|
revision="1",
|
|
@@ -222,10 +222,10 @@ google_text_multilingual_emb_002 = ModelMeta(
|
|
|
222
222
|
google_gemini_embedding_001 = ModelMeta(
|
|
223
223
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
224
224
|
loader_kwargs=dict(
|
|
225
|
-
model_name="gemini-embedding-001",
|
|
226
225
|
model_prompts=MODEL_PROMPTS,
|
|
227
226
|
),
|
|
228
227
|
name="google/gemini-embedding-001",
|
|
228
|
+
model_type=["dense"],
|
|
229
229
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES,
|
|
230
230
|
open_weights=False,
|
|
231
231
|
revision="1",
|
|
@@ -260,6 +260,7 @@ def gemma_embedding_loader(model_name: str, revision: str, **kwargs):
|
|
|
260
260
|
embedding_gemma_300m = ModelMeta(
|
|
261
261
|
loader=gemma_embedding_loader,
|
|
262
262
|
name="google/embeddinggemma-300m",
|
|
263
|
+
model_type=["dense"],
|
|
263
264
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES,
|
|
264
265
|
open_weights=True,
|
|
265
266
|
revision="64614b0b8b64f0c6c1e52b07e4e9a4e8fe4d2da2",
|
|
@@ -275,5 +276,15 @@ embedding_gemma_300m = ModelMeta(
|
|
|
275
276
|
public_training_data=None,
|
|
276
277
|
training_datasets=GECKO_TRAINING_DATA,
|
|
277
278
|
similarity_fn_name="cosine",
|
|
278
|
-
memory_usage_mb=
|
|
279
|
+
memory_usage_mb=1155,
|
|
280
|
+
citation="""
|
|
281
|
+
@misc{vera2025embeddinggemmapowerfullightweighttext,
|
|
282
|
+
title={EmbeddingGemma: Powerful and Lightweight Text Representations},
|
|
283
|
+
author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
|
|
284
|
+
year={2025},
|
|
285
|
+
eprint={2509.20354},
|
|
286
|
+
archivePrefix={arXiv},
|
|
287
|
+
primaryClass={cs.CL},
|
|
288
|
+
url={https://arxiv.org/abs/2509.20354},
|
|
289
|
+
}""",
|
|
279
290
|
)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from mteb.types import Array, BatchedInput, PromptType
|
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from PIL import Image
|
|
21
|
+
|
|
18
22
|
|
|
19
23
|
class GraniteVisionEmbeddingWrapper:
|
|
20
24
|
def __init__(
|
|
@@ -162,6 +166,7 @@ granite_vision_embedding = ModelMeta(
|
|
|
162
166
|
torch_dtype=torch.float16,
|
|
163
167
|
),
|
|
164
168
|
name="ibm-granite/granite-vision-3.3-2b-embedding",
|
|
169
|
+
model_type=["dense"],
|
|
165
170
|
languages=["eng-Latn"],
|
|
166
171
|
revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca",
|
|
167
172
|
release_date="2025-06-11",
|
|
@@ -38,6 +38,7 @@ gritlm7b = ModelMeta(
|
|
|
38
38
|
torch_dtype="auto",
|
|
39
39
|
),
|
|
40
40
|
name="GritLM/GritLM-7B",
|
|
41
|
+
model_type=["dense"],
|
|
41
42
|
languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
|
|
42
43
|
open_weights=True,
|
|
43
44
|
revision="13f00a0e36500c80ce12870ea513846a066004af",
|
|
@@ -66,6 +67,7 @@ gritlm8x7b = ModelMeta(
|
|
|
66
67
|
torch_dtype="auto",
|
|
67
68
|
),
|
|
68
69
|
name="GritLM/GritLM-8x7B",
|
|
70
|
+
model_type=["dense"],
|
|
69
71
|
languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
|
|
70
72
|
open_weights=True,
|
|
71
73
|
revision="7f089b13e3345510281733ca1e6ff871b5b4bc76",
|
|
@@ -42,6 +42,7 @@ gte_qwen2_7b_instruct = ModelMeta(
|
|
|
42
42
|
embed_eos="<|endoftext|>",
|
|
43
43
|
),
|
|
44
44
|
name="Alibaba-NLP/gte-Qwen2-7B-instruct",
|
|
45
|
+
model_type=["dense"],
|
|
45
46
|
languages=None,
|
|
46
47
|
open_weights=True,
|
|
47
48
|
revision="e26182b2122f4435e8b3ebecbf363990f409b45b",
|
|
@@ -73,6 +74,7 @@ gte_qwen1_5_7b_instruct = ModelMeta(
|
|
|
73
74
|
embed_eos="<|endoftext|>",
|
|
74
75
|
),
|
|
75
76
|
name="Alibaba-NLP/gte-Qwen1.5-7B-instruct",
|
|
77
|
+
model_type=["dense"],
|
|
76
78
|
languages=["eng-Latn"],
|
|
77
79
|
open_weights=True,
|
|
78
80
|
revision="07d27e5226328010336563bc1b564a5e3436a298",
|
|
@@ -89,6 +91,12 @@ gte_qwen1_5_7b_instruct = ModelMeta(
|
|
|
89
91
|
public_training_code=None,
|
|
90
92
|
public_training_data=None,
|
|
91
93
|
training_datasets=None,
|
|
94
|
+
citation="""@article{li2023towards,
|
|
95
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
96
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
97
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
98
|
+
year={2023}
|
|
99
|
+
}""",
|
|
92
100
|
)
|
|
93
101
|
|
|
94
102
|
gte_qwen2_1_5b_instruct = ModelMeta(
|
|
@@ -103,6 +111,7 @@ gte_qwen2_1_5b_instruct = ModelMeta(
|
|
|
103
111
|
embed_eos="<|endoftext|>",
|
|
104
112
|
),
|
|
105
113
|
name="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
|
114
|
+
model_type=["dense"],
|
|
106
115
|
languages=["eng-Latn"],
|
|
107
116
|
open_weights=True,
|
|
108
117
|
revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd",
|
|
@@ -119,11 +128,18 @@ gte_qwen2_1_5b_instruct = ModelMeta(
|
|
|
119
128
|
public_training_code=None,
|
|
120
129
|
public_training_data=None,
|
|
121
130
|
training_datasets=None,
|
|
131
|
+
citation="""@article{li2023towards,
|
|
132
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
133
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
134
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
135
|
+
year={2023}
|
|
136
|
+
}""",
|
|
122
137
|
)
|
|
123
138
|
|
|
124
139
|
gte_small_zh = ModelMeta(
|
|
125
140
|
loader=sentence_transformers_loader,
|
|
126
141
|
name="thenlper/gte-small-zh",
|
|
142
|
+
model_type=["dense"],
|
|
127
143
|
languages=["zho-Hans"],
|
|
128
144
|
open_weights=True,
|
|
129
145
|
revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a",
|
|
@@ -140,11 +156,18 @@ gte_small_zh = ModelMeta(
|
|
|
140
156
|
public_training_code=None,
|
|
141
157
|
public_training_data=None,
|
|
142
158
|
training_datasets=None, # Not disclosed
|
|
159
|
+
citation="""@article{li2023towards,
|
|
160
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
161
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
162
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
163
|
+
year={2023}
|
|
164
|
+
}""",
|
|
143
165
|
)
|
|
144
166
|
|
|
145
167
|
gte_base_zh = ModelMeta(
|
|
146
168
|
loader=sentence_transformers_loader,
|
|
147
169
|
name="thenlper/gte-base-zh",
|
|
170
|
+
model_type=["dense"],
|
|
148
171
|
languages=["zho-Hans"],
|
|
149
172
|
open_weights=True,
|
|
150
173
|
revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c",
|
|
@@ -161,11 +184,18 @@ gte_base_zh = ModelMeta(
|
|
|
161
184
|
public_training_code=None,
|
|
162
185
|
public_training_data=None,
|
|
163
186
|
training_datasets=None, # Not disclosed
|
|
187
|
+
citation="""@article{li2023towards,
|
|
188
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
189
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
190
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
191
|
+
year={2023}
|
|
192
|
+
}""",
|
|
164
193
|
)
|
|
165
194
|
|
|
166
195
|
gte_large_zh = ModelMeta(
|
|
167
196
|
loader=sentence_transformers_loader,
|
|
168
197
|
name="thenlper/gte-large-zh",
|
|
198
|
+
model_type=["dense"],
|
|
169
199
|
languages=["zho-Hans"],
|
|
170
200
|
open_weights=True,
|
|
171
201
|
revision="64c364e579de308104a9b2c170ca009502f4f545",
|
|
@@ -182,6 +212,12 @@ gte_large_zh = ModelMeta(
|
|
|
182
212
|
public_training_code=None,
|
|
183
213
|
public_training_data=None,
|
|
184
214
|
training_datasets=None, # Not disclosed
|
|
215
|
+
citation="""@article{li2023towards,
|
|
216
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
217
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
218
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
219
|
+
year={2023}
|
|
220
|
+
}""",
|
|
185
221
|
)
|
|
186
222
|
|
|
187
223
|
gte_multilingual_langs = [
|
|
@@ -288,6 +324,7 @@ gte_multi_training_data = {
|
|
|
288
324
|
gte_multilingual_base = ModelMeta(
|
|
289
325
|
loader=sentence_transformers_loader,
|
|
290
326
|
name="Alibaba-NLP/gte-multilingual-base",
|
|
327
|
+
model_type=["dense"],
|
|
291
328
|
languages=gte_multilingual_langs,
|
|
292
329
|
open_weights=True,
|
|
293
330
|
revision="ca1791e0bcc104f6db161f27de1340241b13c5a4",
|
|
@@ -304,11 +341,19 @@ gte_multilingual_base = ModelMeta(
|
|
|
304
341
|
public_training_code=None,
|
|
305
342
|
public_training_data=None, # couldn't find
|
|
306
343
|
training_datasets=gte_multi_training_data,
|
|
344
|
+
citation="""@inproceedings{zhang2024mgte,
|
|
345
|
+
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
|
|
346
|
+
author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
|
|
347
|
+
booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
|
|
348
|
+
pages={1393--1412},
|
|
349
|
+
year={2024}
|
|
350
|
+
}""",
|
|
307
351
|
)
|
|
308
352
|
|
|
309
353
|
gte_modernbert_base = ModelMeta(
|
|
310
354
|
loader=sentence_transformers_loader,
|
|
311
355
|
name="Alibaba-NLP/gte-modernbert-base",
|
|
356
|
+
model_type=["dense"],
|
|
312
357
|
languages=["eng-Latn"],
|
|
313
358
|
open_weights=True,
|
|
314
359
|
revision="7ca8b4ca700621b67618669f5378fe5f5820b8e4",
|
|
@@ -325,12 +370,27 @@ gte_modernbert_base = ModelMeta(
|
|
|
325
370
|
public_training_code=None, # couldn't find
|
|
326
371
|
public_training_data=None,
|
|
327
372
|
training_datasets=gte_multi_training_data, # English part of gte_multi_training_data,
|
|
373
|
+
citation="""@inproceedings{zhang2024mgte,
|
|
374
|
+
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
|
|
375
|
+
author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},
|
|
376
|
+
booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
|
|
377
|
+
pages={1393--1412},
|
|
378
|
+
year={2024}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
@article{li2023towards,
|
|
382
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
383
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
384
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
385
|
+
year={2023}
|
|
386
|
+
}""",
|
|
328
387
|
)
|
|
329
388
|
|
|
330
389
|
|
|
331
390
|
gte_base_en_v15 = ModelMeta(
|
|
332
391
|
loader=sentence_transformers_loader,
|
|
333
392
|
name="Alibaba-NLP/gte-base-en-v1.5",
|
|
393
|
+
model_type=["dense"],
|
|
334
394
|
languages=["eng-Latn"],
|
|
335
395
|
open_weights=True,
|
|
336
396
|
revision="a829fd0e060bb84554da0dfd354d0de0f7712b7f", # can be any
|
|
@@ -349,4 +409,22 @@ gte_base_en_v15 = ModelMeta(
|
|
|
349
409
|
public_training_code=None,
|
|
350
410
|
public_training_data=None,
|
|
351
411
|
training_datasets=None,
|
|
412
|
+
citation="""@misc{zhang2024mgte,
|
|
413
|
+
title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},
|
|
414
|
+
author={Xin Zhang and Yanzhao Zhang and Dingkun Long and Wen Xie and Ziqi Dai and Jialong Tang and Huan Lin and Baosong Yang and Pengjun Xie and Fei Huang and Meishan Zhang and Wenjie Li and Min Zhang},
|
|
415
|
+
year={2024},
|
|
416
|
+
eprint={2407.19669},
|
|
417
|
+
archivePrefix={arXiv},
|
|
418
|
+
primaryClass={cs.CL},
|
|
419
|
+
url={https://arxiv.org/abs/2407.19669},
|
|
420
|
+
}
|
|
421
|
+
@misc{li2023gte,
|
|
422
|
+
title={Towards General Text Embeddings with Multi-stage Contrastive Learning},
|
|
423
|
+
author={Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
|
|
424
|
+
year={2023},
|
|
425
|
+
eprint={2308.03281},
|
|
426
|
+
archivePrefix={arXiv},
|
|
427
|
+
primaryClass={cs.CL},
|
|
428
|
+
url={https://arxiv.org/abs/2308.03281},
|
|
429
|
+
}""",
|
|
352
430
|
)
|
|
@@ -94,6 +94,7 @@ granite_training_data = {
|
|
|
94
94
|
granite_107m_multilingual = ModelMeta(
|
|
95
95
|
loader=sentence_transformers_loader,
|
|
96
96
|
name="ibm-granite/granite-embedding-107m-multilingual",
|
|
97
|
+
model_type=["dense"],
|
|
97
98
|
languages=GRANITE_LANGUAGES,
|
|
98
99
|
open_weights=True,
|
|
99
100
|
revision="47db56afe692f731540413c67dd818ff492277e7",
|
|
@@ -118,6 +119,7 @@ granite_107m_multilingual = ModelMeta(
|
|
|
118
119
|
granite_278m_multilingual = ModelMeta(
|
|
119
120
|
loader=sentence_transformers_loader,
|
|
120
121
|
name="ibm-granite/granite-embedding-278m-multilingual",
|
|
122
|
+
model_type=["dense"],
|
|
121
123
|
languages=GRANITE_LANGUAGES,
|
|
122
124
|
open_weights=True,
|
|
123
125
|
revision="84e3546b88b0cb69f8078608a1df558020bcbf1f",
|
|
@@ -142,6 +144,7 @@ granite_278m_multilingual = ModelMeta(
|
|
|
142
144
|
granite_30m_english = ModelMeta(
|
|
143
145
|
loader=sentence_transformers_loader,
|
|
144
146
|
name="ibm-granite/granite-embedding-30m-english",
|
|
147
|
+
model_type=["dense"],
|
|
145
148
|
languages=["eng-Latn"],
|
|
146
149
|
open_weights=True,
|
|
147
150
|
revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5",
|
|
@@ -166,6 +169,7 @@ granite_30m_english = ModelMeta(
|
|
|
166
169
|
granite_125m_english = ModelMeta(
|
|
167
170
|
loader=sentence_transformers_loader,
|
|
168
171
|
name="ibm-granite/granite-embedding-125m-english",
|
|
172
|
+
model_type=["dense"],
|
|
169
173
|
languages=["eng-Latn"],
|
|
170
174
|
open_weights=True,
|
|
171
175
|
revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730",
|
|
@@ -191,6 +195,7 @@ granite_125m_english = ModelMeta(
|
|
|
191
195
|
granite_english_r2 = ModelMeta(
|
|
192
196
|
loader=sentence_transformers_loader,
|
|
193
197
|
name="ibm-granite/granite-embedding-english-r2",
|
|
198
|
+
model_type=["dense"],
|
|
194
199
|
languages=["eng-Latn"],
|
|
195
200
|
open_weights=True,
|
|
196
201
|
revision="6e7b8ce0e76270394ac4669ba4bbd7133b60b7f9",
|
|
@@ -215,6 +220,7 @@ granite_english_r2 = ModelMeta(
|
|
|
215
220
|
granite_small_english_r2 = ModelMeta(
|
|
216
221
|
loader=sentence_transformers_loader,
|
|
217
222
|
name="ibm-granite/granite-embedding-small-english-r2",
|
|
223
|
+
model_type=["dense"],
|
|
218
224
|
languages=["eng-Latn"],
|
|
219
225
|
open_weights=True,
|
|
220
226
|
revision="54a8d2616a0844355a5164432d3f6dafb37b17a3",
|
|
@@ -50,6 +50,7 @@ inf_retriever_v1 = ModelMeta(
|
|
|
50
50
|
trust_remote_code=True,
|
|
51
51
|
),
|
|
52
52
|
name="infly/inf-retriever-v1",
|
|
53
|
+
model_type=["dense"],
|
|
53
54
|
languages=["eng-Latn", "zho-Hans"],
|
|
54
55
|
open_weights=True,
|
|
55
56
|
revision="cb70ca7c31dfa866b2eff2dad229c144d8ddfd91",
|
|
@@ -76,6 +77,7 @@ inf_retriever_v1_1_5b = ModelMeta(
|
|
|
76
77
|
trust_remote_code=True,
|
|
77
78
|
),
|
|
78
79
|
name="infly/inf-retriever-v1-1.5b",
|
|
80
|
+
model_type=["dense"],
|
|
79
81
|
languages=["eng-Latn", "zho-Hans"],
|
|
80
82
|
open_weights=True,
|
|
81
83
|
revision="c9c05c2dd50707a486966ba81703021ae2094a06",
|