mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
|
@@ -193,12 +193,13 @@ NOMIC_CITATION = """
|
|
|
193
193
|
"""
|
|
194
194
|
|
|
195
195
|
nomic_embed_v1_5 = ModelMeta(
|
|
196
|
-
loader=NomicWrapper,
|
|
196
|
+
loader=NomicWrapper, # type: ignore
|
|
197
197
|
loader_kwargs=dict(
|
|
198
198
|
trust_remote_code=True,
|
|
199
199
|
model_prompts=model_prompts,
|
|
200
200
|
),
|
|
201
201
|
name="nomic-ai/nomic-embed-text-v1.5",
|
|
202
|
+
model_type=["dense"],
|
|
202
203
|
languages=["eng-Latn"],
|
|
203
204
|
open_weights=True,
|
|
204
205
|
revision="b0753ae76394dd36bcfb912a46018088bca48be0",
|
|
@@ -221,12 +222,13 @@ nomic_embed_v1_5 = ModelMeta(
|
|
|
221
222
|
)
|
|
222
223
|
|
|
223
224
|
nomic_embed_v1 = ModelMeta(
|
|
224
|
-
loader=NomicWrapper,
|
|
225
|
+
loader=NomicWrapper, # type: ignore
|
|
225
226
|
loader_kwargs=dict(
|
|
226
227
|
trust_remote_code=True,
|
|
227
228
|
model_prompts=model_prompts,
|
|
228
229
|
),
|
|
229
230
|
name="nomic-ai/nomic-embed-text-v1",
|
|
231
|
+
model_type=["dense"],
|
|
230
232
|
languages=["eng-Latn"],
|
|
231
233
|
open_weights=True,
|
|
232
234
|
revision="0759316f275aa0cb93a5b830973843ca66babcf5",
|
|
@@ -249,12 +251,13 @@ nomic_embed_v1 = ModelMeta(
|
|
|
249
251
|
)
|
|
250
252
|
|
|
251
253
|
nomic_embed_v1_ablated = ModelMeta(
|
|
252
|
-
loader=NomicWrapper,
|
|
254
|
+
loader=NomicWrapper, # type: ignore
|
|
253
255
|
loader_kwargs=dict(
|
|
254
256
|
trust_remote_code=True,
|
|
255
257
|
model_prompts=model_prompts,
|
|
256
258
|
),
|
|
257
259
|
name="nomic-ai/nomic-embed-text-v1-ablated",
|
|
260
|
+
model_type=["dense"],
|
|
258
261
|
languages=["eng-Latn"],
|
|
259
262
|
open_weights=True,
|
|
260
263
|
revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f",
|
|
@@ -276,12 +279,13 @@ nomic_embed_v1_ablated = ModelMeta(
|
|
|
276
279
|
)
|
|
277
280
|
|
|
278
281
|
nomic_embed_v1_unsupervised = ModelMeta(
|
|
279
|
-
loader=NomicWrapper,
|
|
282
|
+
loader=NomicWrapper, # type: ignore
|
|
280
283
|
loader_kwargs=dict(
|
|
281
284
|
trust_remote_code=True,
|
|
282
285
|
model_prompts=model_prompts,
|
|
283
286
|
),
|
|
284
287
|
name="nomic-ai/nomic-embed-text-v1-unsupervised",
|
|
288
|
+
model_type=["dense"],
|
|
285
289
|
languages=["eng-Latn"],
|
|
286
290
|
open_weights=True,
|
|
287
291
|
revision="b53d557b15ae63852847c222d336c1609eced93c",
|
|
@@ -309,6 +313,7 @@ nomic_modern_bert_embed = ModelMeta(
|
|
|
309
313
|
model_prompts=model_prompts,
|
|
310
314
|
),
|
|
311
315
|
name="nomic-ai/modernbert-embed-base",
|
|
316
|
+
model_type=["dense"],
|
|
312
317
|
languages=["eng-Latn"],
|
|
313
318
|
open_weights=True,
|
|
314
319
|
revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12",
|
|
@@ -328,4 +333,151 @@ nomic_modern_bert_embed = ModelMeta(
|
|
|
328
333
|
superseded_by=None,
|
|
329
334
|
training_datasets=nomic_training_data,
|
|
330
335
|
public_training_data=None,
|
|
336
|
+
citation="""@misc{nussbaum2024nomic,
|
|
337
|
+
title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
|
|
338
|
+
author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
|
|
339
|
+
year={2024},
|
|
340
|
+
eprint={2402.01613},
|
|
341
|
+
archivePrefix={arXiv},
|
|
342
|
+
primaryClass={cs.CL}
|
|
343
|
+
}""",
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# Language tags (ISO 639-3 code + ISO 15924 script) passed as ``languages``
# to the nomic-embed-text-v2-moe model metadata. Order is preserved as-is.
m_languages = [
    "eng-Latn", "spa-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "por-Latn", "pol-Latn",
    "nld-Latn", "tur-Latn", "jpn-Jpan", "vie-Latn", "rus-Cyrl", "ind-Latn", "arb-Arab",
    "ces-Latn", "ron-Latn", "swe-Latn", "ell-Grek", "ukr-Cyrl", "zho-Hans", "hun-Latn",
    "dan-Latn", "nor-Latn", "hin-Deva", "fin-Latn", "bul-Cyrl", "kor-Hang", "slk-Latn",
    "tha-Thai", "heb-Hebr", "cat-Latn", "lit-Latn", "fas-Arab", "msa-Latn", "slv-Latn",
    "lav-Latn", "mar-Deva", "ben-Beng", "sqi-Latn", "cym-Latn", "bel-Cyrl", "mal-Mlym",
    "kan-Knda", "mkd-Cyrl", "urd-Arab", "fry-Latn", "fil-Latn", "tel-Telu", "eus-Latn",
    "swh-Latn", "som-Latn", "snd-Arab", "uzb-Latn", "cos-Latn", "hrv-Latn", "guj-Gujr",
    "hin-Latn", "ceb-Latn", "epo-Latn", "jav-Latn", "lat-Latn", "zul-Latn", "mon-Cyrl",
    "sin-Sinh", "ell-Latn", "gle-Latn", "kir-Cyrl", "tgk-Cyrl", "mya-Mymr", "khm-Khmr",
    "mlg-Latn", "pan-Guru", "rus-Latn", "sna-Latn", "zho-Latn", "hau-Latn", "heb-Latn",
    "hmn-Latn", "hat-Latn", "jpn-Latn", "sun-Latn", "bul-Latn", "gla-Latn", "nya-Latn",
    "pus-Arab", "kur-Latn", "hbs-Latn", "amh-Ethi", "ibo-Latn", "lao-Laoo", "mri-Latn",
    "nno-Latn", "smo-Latn", "yid-Hebr", "sot-Latn", "tgl-Latn", "xho-Latn", "yor-Latn",
]
|
|
447
|
+
|
|
448
|
+
# Metadata entry for nomic-ai/nomic-embed-text-v2-moe. It reuses the shared
# ``model_prompts`` for the loader and ``m_languages`` for language coverage.
nomic_embed_text_v2_moe = ModelMeta(
    loader=NomicWrapper,  # type: ignore
    loader_kwargs={
        "trust_remote_code": True,
        "model_prompts": model_prompts,
    },
    name="nomic-ai/nomic-embed-text-v2-moe",
    model_type=["dense"],
    languages=m_languages,
    open_weights=True,
    revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
    release_date="2025-02-07",
    n_parameters=475_292_928,
    memory_usage_mb=1813,
    max_tokens=512,
    embed_dim=768,
    license="apache-2.0",
    reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
    similarity_fn_name=ScoringFunction.COSINE,
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    adapted_from="nomic-ai/nomic-xlm-2048",
    public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
    public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
    training_datasets=None,  # did not look into this further
    superseded_by=None,
    citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
title={Training Sparse Mixture Of Experts Text Embedding Models},
author={Zach Nussbaum and Brandon Duderstadt},
year={2025},
eprint={2502.07972},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.07972},
}""",
)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
6
|
import torch.nn.functional as F
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
14
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
15
|
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
15
19
|
NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
|
|
16
20
|
title={Nomic Embed Vision: Expanding the Latent Space},
|
|
17
21
|
author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
|
|
@@ -164,6 +168,7 @@ nomic_embed_vision_v1_5 = ModelMeta(
|
|
|
164
168
|
"text_model_revision": "a03db6748c80237063eb0546ac6b627eca2318cb",
|
|
165
169
|
},
|
|
166
170
|
name="nomic-ai/nomic-embed-vision-v1.5",
|
|
171
|
+
model_type=["dense"],
|
|
167
172
|
languages=["eng-Latn"],
|
|
168
173
|
revision="af2246fffdab78d8458418480e4886a8e48b70a7",
|
|
169
174
|
release_date="2024-06-08",
|
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
|
2
2
|
|
|
3
3
|
import torch
|
|
4
|
-
from PIL import Image
|
|
5
4
|
from torch.utils.data import DataLoader
|
|
6
5
|
|
|
7
6
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
@@ -9,6 +8,10 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
9
8
|
from mteb.models.model_meta import ModelMeta
|
|
10
9
|
from mteb.types import Array, BatchedInput, PromptType
|
|
11
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
pass
|
|
13
|
+
|
|
14
|
+
|
|
12
15
|
LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
|
|
13
16
|
title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
|
|
14
17
|
author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
|
|
@@ -53,6 +56,7 @@ class LlamaNemoretrieverColembed(AbsEncoder):
|
|
|
53
56
|
**kwargs,
|
|
54
57
|
):
|
|
55
58
|
import torchvision.transforms.functional as F
|
|
59
|
+
from PIL import Image
|
|
56
60
|
|
|
57
61
|
all_images = []
|
|
58
62
|
if isinstance(images, DataLoader):
|
|
@@ -61,14 +65,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
|
|
|
61
65
|
iterator = DataLoader(images, batch_size=batch_size)
|
|
62
66
|
|
|
63
67
|
for batch in iterator:
|
|
64
|
-
for
|
|
68
|
+
for image in batch["image"]:
|
|
65
69
|
pil_img = (
|
|
66
|
-
|
|
70
|
+
image
|
|
71
|
+
if isinstance(image, Image.Image)
|
|
72
|
+
else F.to_pil_image(image.to("cpu"))
|
|
67
73
|
)
|
|
68
74
|
all_images.append(pil_img)
|
|
69
75
|
|
|
70
76
|
batch_size = 1
|
|
71
|
-
return self.model.
|
|
77
|
+
return self.model.forward_images(all_images, batch_size=batch_size)
|
|
72
78
|
|
|
73
79
|
def calculate_probs(self, text_embeddings, image_embeddings):
|
|
74
80
|
scores = self.similarity(text_embeddings, image_embeddings)
|
|
@@ -117,19 +123,18 @@ class LlamaNemoretrieverColembed(AbsEncoder):
|
|
|
117
123
|
|
|
118
124
|
TRAINING_DATA = {
|
|
119
125
|
# from https://huggingface.co/datasets/vidore/colpali_train_set
|
|
120
|
-
"
|
|
121
|
-
"
|
|
122
|
-
"
|
|
123
|
-
"
|
|
124
|
-
"
|
|
125
|
-
"
|
|
126
|
+
"VidoreDocVQARetrieval",
|
|
127
|
+
"VidoreInfoVQARetrieval",
|
|
128
|
+
"VidoreTatdqaRetrieval",
|
|
129
|
+
"VidoreArxivQARetrieval",
|
|
130
|
+
"HotpotQA",
|
|
131
|
+
"MIRACLRetrieval",
|
|
126
132
|
"NQ",
|
|
127
|
-
"
|
|
133
|
+
"StackExchangeClustering",
|
|
128
134
|
"SQuAD",
|
|
129
135
|
"WebInstructSub",
|
|
130
136
|
"docmatix-ir",
|
|
131
|
-
"
|
|
132
|
-
"colpali_train_set", # as it contains PDFs
|
|
137
|
+
"VDRMultilingualRetrieval",
|
|
133
138
|
"VisRAG-Ret-Train-Synthetic-data",
|
|
134
139
|
"VisRAG-Ret-Train-In-domain-data",
|
|
135
140
|
"wiki-ss-nq",
|
|
@@ -141,12 +146,13 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
|
|
|
141
146
|
trust_remote_code=True,
|
|
142
147
|
),
|
|
143
148
|
name="nvidia/llama-nemoretriever-colembed-1b-v1",
|
|
149
|
+
model_type=["late-interaction"],
|
|
144
150
|
languages=["eng-Latn"],
|
|
145
151
|
revision="1f0fdea7f5b19532a750be109b19072d719b8177",
|
|
146
152
|
release_date="2025-06-27",
|
|
147
153
|
modalities=["image", "text"],
|
|
148
154
|
n_parameters=2_418_000_000,
|
|
149
|
-
memory_usage_mb=
|
|
155
|
+
memory_usage_mb=4610,
|
|
150
156
|
max_tokens=8192,
|
|
151
157
|
embed_dim=2048,
|
|
152
158
|
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
|
|
@@ -167,12 +173,13 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
|
|
|
167
173
|
trust_remote_code=True,
|
|
168
174
|
),
|
|
169
175
|
name="nvidia/llama-nemoretriever-colembed-3b-v1",
|
|
176
|
+
model_type=["late-interaction"],
|
|
170
177
|
languages=["eng-Latn"],
|
|
171
178
|
revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c",
|
|
172
179
|
release_date="2025-06-27",
|
|
173
180
|
modalities=["image", "text"],
|
|
174
181
|
n_parameters=4_407_000_000,
|
|
175
|
-
memory_usage_mb=
|
|
182
|
+
memory_usage_mb=8403,
|
|
176
183
|
max_tokens=8192,
|
|
177
184
|
embed_dim=3072,
|
|
178
185
|
license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
|
|
@@ -111,6 +111,7 @@ NV_embed_v2 = ModelMeta(
|
|
|
111
111
|
add_eos_token=True,
|
|
112
112
|
),
|
|
113
113
|
name="nvidia/NV-Embed-v2",
|
|
114
|
+
model_type=["dense"],
|
|
114
115
|
languages=["eng-Latn"],
|
|
115
116
|
open_weights=True,
|
|
116
117
|
revision="7604d305b621f14095a1aa23d351674c2859553a",
|
|
@@ -141,12 +142,13 @@ NV_embed_v1 = ModelMeta(
|
|
|
141
142
|
add_eos_token=True,
|
|
142
143
|
),
|
|
143
144
|
name="nvidia/NV-Embed-v1",
|
|
145
|
+
model_type=["dense"],
|
|
144
146
|
languages=["eng-Latn"],
|
|
145
147
|
open_weights=True,
|
|
146
148
|
revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c",
|
|
147
149
|
release_date="2024-09-13", # initial commit of hf model.
|
|
148
150
|
n_parameters=7_850_000_000,
|
|
149
|
-
memory_usage_mb=
|
|
151
|
+
memory_usage_mb=14975,
|
|
150
152
|
embed_dim=4096,
|
|
151
153
|
license="cc-by-nc-4.0",
|
|
152
154
|
max_tokens=32768,
|
|
@@ -528,6 +530,7 @@ class LlamaEmbedNemotron(AbsEncoder):
|
|
|
528
530
|
llama_embed_nemotron_8b = ModelMeta(
|
|
529
531
|
loader=LlamaEmbedNemotron,
|
|
530
532
|
name="nvidia/llama-embed-nemotron-8b",
|
|
533
|
+
model_type=["dense"],
|
|
531
534
|
languages=llama_embed_nemotron_evaluated_languages,
|
|
532
535
|
open_weights=True,
|
|
533
536
|
revision="84a375593d27d3528beb4e104822515659e093b4",
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
|
+
from mteb.models.model_meta import ModelMeta
|
|
3
|
+
from mteb.models.models_protocols import PromptType
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def instruction_template(
    instruction: str | dict[PromptType, str],
    prompt_type: PromptType | None = None,
) -> str:
    """Build the instruction prefix placed in front of an input text.

    Args:
        instruction: Either the instruction text itself, or a mapping from
            prompt type to instruction text.
        prompt_type: The kind of input being encoded, if known.

    Returns:
        * ``" "`` when ``prompt_type`` is ``PromptType.document``;
        * ``""`` when ``instruction`` is falsy (empty string, empty dict, ``None``);
        * otherwise ``"Instruct: <instruction>\\nQuery:"``.

    Raises:
        KeyError: If ``instruction`` is a mapping, ``prompt_type`` is given, and
            the mapping has no entry for it.
    """
    if prompt_type == PromptType.document:
        # to avoid this issue: https://huggingface.co/Qwen/Qwen3-Embedding-8B/discussions/21
        return " "
    if not instruction:
        return ""
    if isinstance(instruction, dict):
        if prompt_type is None:
            # No prompt type to select on: fall back to the first entry.
            instruction = next(iter(instruction.values()))  # TODO
        else:
            instruction = instruction[prompt_type]
    return f"Instruct: {instruction}\nQuery:"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Language tags (ISO 639-3 code + ISO 15924 script) passed as ``languages``
# to the Octen-Embedding-8B model metadata. Order is preserved as-is.
multilingual_langs = [
    "afr-Latn", "ara-Arab", "aze-Latn", "bel-Cyrl", "bul-Cyrl", "ben-Beng", "cat-Latn",
    "ceb-Latn", "ces-Latn", "cym-Latn", "dan-Latn", "deu-Latn", "ell-Grek", "eng-Latn",
    "spa-Latn", "est-Latn", "eus-Latn", "fas-Arab", "fin-Latn", "fra-Latn", "glg-Latn",
    "guj-Gujr", "heb-Hebr", "hin-Deva", "hrv-Latn", "hat-Latn", "hun-Latn", "hye-Armn",
    "ind-Latn", "isl-Latn", "ita-Latn", "jpn-Jpan", "jav-Latn", "kat-Geor", "kaz-Cyrl",
    "khm-Khmr", "kan-Knda", "kor-Hang", "kir-Cyrl", "lao-Laoo", "lit-Latn", "lav-Latn",
    "mkd-Cyrl", "mal-Mlym", "mon-Cyrl", "mar-Deva", "msa-Latn", "mya-Mymr", "nep-Deva",
    "nld-Latn", "nor-Latn", "nob-Latn", "nno-Latn", "pan-Guru", "pol-Latn", "por-Latn",
    "que-Latn", "ron-Latn", "rus-Cyrl", "sin-Sinh", "slk-Latn", "slv-Latn", "swa-Latn",
    "tam-Taml", "tel-Telu", "tha-Thai", "tgl-Latn", "tur-Latn", "ukr-Cyrl", "urd-Arab",
    "vie-Latn", "yor-Latn", "zho-Hans",
]
|
|
98
|
+
|
|
99
|
+
# BibTeX entry attached to the Octen model metadata. The ``url`` field points
# at the same Hugging Face repository as the model's ``reference``.
OCTEN_CITATION = """@misc{octen-embedding-2025,
title={Octen-Embedding-8B: A Fine-tuned Multilingual Text Embedding Model},
author={Octen Team},
year={2025},
url={https://huggingface.co/bflhc/Octen-Embedding-8B}
}"""
|
|
105
|
+
|
|
106
|
+
# Names of the datasets declared as training data for the Octen model; this
# set is passed as ``training_datasets`` in the model metadata.
training_data = {
    "T2Retrieval", "DuRetrieval", "MMarcoReranking", "CMedQAv2-reranking",
    "NQ", "MSMARCO", "HotpotQA", "FEVER",
    "MrTidyRetrieval", "MIRACLRetrieval", "CodeSearchNet",
}
|
|
119
|
+
|
|
120
|
+
# Task name -> retrieval instruction for various RTEB tasks. The mapping is
# handed to the model loader as ``prompts_dict``.
_PREDEFINED_PROMPTS = {
    # ---------------- Open datasets ----------------
    # Legal
    "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
    "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
    "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
    "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
    # Code
    # NOTE(review): AppsRetrieval is grouped with the code tasks, yet its prompt
    # talks about mobile applications -- confirm this wording is intended.
    "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
    "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
    "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
    "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
    "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
    # Finance
    "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
    "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
    "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
    # Medical
    "CUREv1": "Given a medical query, retrieve relevant clinical documents",
    "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
    # SQL
    "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
    # Multilingual
    "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
    # ---------------- Private/closed datasets ----------------
    # Code (private)
    "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
    "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
    # Finance (private)
    "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
    "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
    # Healthcare (private)
    "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
    "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
    # Legal (private)
    "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
    # General / multilingual (private)
    "French1Retrieval": "Given a query, retrieve relevant passages",
    "German1Retrieval": "Given a query, retrieve relevant passages",
}
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Metadata entry for bflhc/Octen-Embedding-8B. The model is loaded through
# InstructSentenceTransformerModel with the task prompts and instruction
# template defined in this module.
Octen_Embedding_8B = ModelMeta(
    loader=InstructSentenceTransformerModel,
    loader_kwargs=dict(
        instruction_template=instruction_template,
        apply_instruction_to_passages=True,
        prompts_dict=_PREDEFINED_PROMPTS,
        max_seq_length=18480,
        model_kwargs={"torch_dtype": "bfloat16"},
    ),
    name="bflhc/Octen-Embedding-8B",
    # Stated explicitly, as the other ModelMeta entries in this release do.
    model_type=["dense"],
    languages=multilingual_langs,
    open_weights=True,
    revision="2030603c2926ab005fafd824fac5911e271be21f",
    release_date="2025-12-23",
    n_parameters=7567295488,
    memory_usage_mb=14433,
    embed_dim=4096,
    max_tokens=32768,
    license="apache-2.0",
    reference="https://huggingface.co/bflhc/Octen-Embedding-8B",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    public_training_code=None,
    public_training_data=None,
    training_datasets=training_data,
    citation=OCTEN_CITATION,
    adapted_from="Qwen/Qwen3-Embedding-8B",
)
|
|
@@ -91,10 +91,6 @@ class OpenAIModel(AbsEncoder):
|
|
|
91
91
|
|
|
92
92
|
from openai import NotGiven
|
|
93
93
|
|
|
94
|
-
if self.model_name == "text-embedding-ada-002" and self._embed_dim is not None:
|
|
95
|
-
logger.warning(
|
|
96
|
-
"Reducing embedding size available only for text-embedding-3-* models"
|
|
97
|
-
)
|
|
98
94
|
sentences = [text for batch in inputs for text in batch["text"]]
|
|
99
95
|
|
|
100
96
|
mask_sents = [(i, t) for i, t in enumerate(sentences) if t.strip()]
|
|
@@ -122,13 +118,22 @@ class OpenAIModel(AbsEncoder):
|
|
|
122
118
|
|
|
123
119
|
no_empty_embeddings = []
|
|
124
120
|
|
|
121
|
+
# Set dimensions only for models that support it
|
|
122
|
+
dimensions = (
|
|
123
|
+
self._embed_dim or NotGiven()
|
|
124
|
+
if not self.model_name == "text-embedding-ada-002"
|
|
125
|
+
else NotGiven()
|
|
126
|
+
)
|
|
127
|
+
default_kwargs = dict(
|
|
128
|
+
model=self.model_name,
|
|
129
|
+
encoding_format="float",
|
|
130
|
+
dimensions=dimensions,
|
|
131
|
+
)
|
|
132
|
+
|
|
125
133
|
for sublist in tqdm(sublists, leave=False, disable=not show_progress_bar):
|
|
126
134
|
try:
|
|
127
135
|
response = self._client.embeddings.create(
|
|
128
|
-
input=sublist,
|
|
129
|
-
model=self.model_name,
|
|
130
|
-
encoding_format="float",
|
|
131
|
-
dimensions=self._embed_dim or NotGiven(),
|
|
136
|
+
input=sublist, **default_kwargs
|
|
132
137
|
)
|
|
133
138
|
except Exception as e:
|
|
134
139
|
# Sleep due to too many requests
|
|
@@ -138,19 +143,13 @@ class OpenAIModel(AbsEncoder):
|
|
|
138
143
|
time.sleep(10)
|
|
139
144
|
try:
|
|
140
145
|
response = self._client.embeddings.create(
|
|
141
|
-
input=sublist,
|
|
142
|
-
model=self.model_name,
|
|
143
|
-
encoding_format="float",
|
|
144
|
-
dimensions=self._embed_dim or NotGiven(),
|
|
146
|
+
input=sublist, **default_kwargs
|
|
145
147
|
)
|
|
146
148
|
except Exception as e:
|
|
147
149
|
logger.info("Sleeping for 60 seconds due to error", e)
|
|
148
150
|
time.sleep(60)
|
|
149
151
|
response = self._client.embeddings.create(
|
|
150
|
-
input=sublist,
|
|
151
|
-
model=self.model_name,
|
|
152
|
-
encoding_format="float",
|
|
153
|
-
dimensions=self._embed_dim or NotGiven(),
|
|
152
|
+
input=sublist, **default_kwargs
|
|
154
153
|
)
|
|
155
154
|
no_empty_embeddings.extend(self._to_numpy(response))
|
|
156
155
|
|
|
@@ -168,6 +167,7 @@ class OpenAIModel(AbsEncoder):
|
|
|
168
167
|
|
|
169
168
|
text_embedding_3_small = ModelMeta(
|
|
170
169
|
name="openai/text-embedding-3-small",
|
|
170
|
+
model_type=["dense"],
|
|
171
171
|
revision="3",
|
|
172
172
|
release_date="2024-01-25",
|
|
173
173
|
languages=None, # supported languages not specified
|
|
@@ -192,6 +192,7 @@ text_embedding_3_small = ModelMeta(
|
|
|
192
192
|
)
|
|
193
193
|
text_embedding_3_large = ModelMeta(
|
|
194
194
|
name="openai/text-embedding-3-large",
|
|
195
|
+
model_type=["dense"],
|
|
195
196
|
revision="3",
|
|
196
197
|
release_date="2024-01-25",
|
|
197
198
|
languages=None, # supported languages not specified
|
|
@@ -216,6 +217,7 @@ text_embedding_3_large = ModelMeta(
|
|
|
216
217
|
)
|
|
217
218
|
text_embedding_ada_002 = ModelMeta(
|
|
218
219
|
name="openai/text-embedding-ada-002",
|
|
220
|
+
model_type=["dense"],
|
|
219
221
|
revision="3",
|
|
220
222
|
release_date="2022-12-15",
|
|
221
223
|
languages=None, # supported languages not specified
|
|
@@ -241,6 +243,7 @@ text_embedding_ada_002 = ModelMeta(
|
|
|
241
243
|
|
|
242
244
|
text_embedding_3_small_512 = ModelMeta(
|
|
243
245
|
name="openai/text-embedding-3-small (embed_dim=512)",
|
|
246
|
+
model_type=["dense"],
|
|
244
247
|
revision="3",
|
|
245
248
|
release_date="2024-01-25",
|
|
246
249
|
languages=None, # supported languages not specified
|
|
@@ -267,6 +270,7 @@ text_embedding_3_small_512 = ModelMeta(
|
|
|
267
270
|
|
|
268
271
|
text_embedding_3_large_512 = ModelMeta(
|
|
269
272
|
name="openai/text-embedding-3-large (embed_dim=512)",
|
|
273
|
+
model_type=["dense"],
|
|
270
274
|
revision="3",
|
|
271
275
|
release_date="2024-01-25",
|
|
272
276
|
languages=None, # supported languages not specified
|