mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
8
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
9
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
11
16
|
|
|
12
17
|
SIGLIP_CITATION = """@misc{zhai2023sigmoid,
|
|
13
18
|
title={Sigmoid Loss for Language Image Pre-Training},
|
|
@@ -131,6 +136,7 @@ siglip_so400m_patch14_224 = ModelMeta(
|
|
|
131
136
|
release_date="2024-01-08",
|
|
132
137
|
modalities=["image", "text"],
|
|
133
138
|
n_parameters=877_000_000,
|
|
139
|
+
n_embedding_parameters=None,
|
|
134
140
|
memory_usage_mb=3347,
|
|
135
141
|
max_tokens=16,
|
|
136
142
|
embed_dim=1152,
|
|
@@ -155,6 +161,7 @@ siglip_so400m_patch14_384 = ModelMeta(
|
|
|
155
161
|
release_date="2024-01-08",
|
|
156
162
|
modalities=["image", "text"],
|
|
157
163
|
n_parameters=878_000_000,
|
|
164
|
+
n_embedding_parameters=None,
|
|
158
165
|
memory_usage_mb=3349,
|
|
159
166
|
max_tokens=64,
|
|
160
167
|
embed_dim=1152,
|
|
@@ -179,6 +186,7 @@ siglip_so400m_patch16_256_i18n = ModelMeta(
|
|
|
179
186
|
release_date="2024-01-08",
|
|
180
187
|
modalities=["image", "text"],
|
|
181
188
|
n_parameters=1_130_000_000,
|
|
189
|
+
n_embedding_parameters=None,
|
|
182
190
|
memory_usage_mb=4306,
|
|
183
191
|
max_tokens=64,
|
|
184
192
|
embed_dim=1152,
|
|
@@ -203,6 +211,7 @@ siglip_base_patch16_256_multilingual = ModelMeta(
|
|
|
203
211
|
release_date="2024-01-08",
|
|
204
212
|
modalities=["image", "text"],
|
|
205
213
|
n_parameters=371_000_000,
|
|
214
|
+
n_embedding_parameters=None,
|
|
206
215
|
memory_usage_mb=1414,
|
|
207
216
|
max_tokens=64,
|
|
208
217
|
embed_dim=768,
|
|
@@ -227,6 +236,7 @@ siglip_base_patch16_256 = ModelMeta(
|
|
|
227
236
|
release_date="2024-01-08",
|
|
228
237
|
modalities=["image", "text"],
|
|
229
238
|
n_parameters=203_000_000,
|
|
239
|
+
n_embedding_parameters=None,
|
|
230
240
|
memory_usage_mb=775,
|
|
231
241
|
max_tokens=64,
|
|
232
242
|
embed_dim=768,
|
|
@@ -251,6 +261,7 @@ siglip_base_patch16_512 = ModelMeta(
|
|
|
251
261
|
release_date="2024-01-08",
|
|
252
262
|
modalities=["image", "text"],
|
|
253
263
|
n_parameters=204_000_000,
|
|
264
|
+
n_embedding_parameters=None,
|
|
254
265
|
memory_usage_mb=777,
|
|
255
266
|
max_tokens=64,
|
|
256
267
|
embed_dim=768,
|
|
@@ -275,6 +286,7 @@ siglip_base_patch16_384 = ModelMeta(
|
|
|
275
286
|
release_date="2024-01-08",
|
|
276
287
|
modalities=["image", "text"],
|
|
277
288
|
n_parameters=203_000_000,
|
|
289
|
+
n_embedding_parameters=None,
|
|
278
290
|
memory_usage_mb=776,
|
|
279
291
|
max_tokens=64,
|
|
280
292
|
embed_dim=768,
|
|
@@ -299,6 +311,7 @@ siglip_base_patch16_224 = ModelMeta(
|
|
|
299
311
|
release_date="2024-01-08",
|
|
300
312
|
modalities=["image", "text"],
|
|
301
313
|
n_parameters=203_000_000,
|
|
314
|
+
n_embedding_parameters=None,
|
|
302
315
|
memory_usage_mb=775,
|
|
303
316
|
max_tokens=64,
|
|
304
317
|
embed_dim=768,
|
|
@@ -323,6 +336,7 @@ siglip_large_patch16_256 = ModelMeta(
|
|
|
323
336
|
release_date="2024-01-08",
|
|
324
337
|
modalities=["image", "text"],
|
|
325
338
|
n_parameters=652_000_000,
|
|
339
|
+
n_embedding_parameters=None,
|
|
326
340
|
memory_usage_mb=2488,
|
|
327
341
|
max_tokens=64,
|
|
328
342
|
embed_dim=1024,
|
|
@@ -347,6 +361,7 @@ siglip_large_patch16_384 = ModelMeta(
|
|
|
347
361
|
release_date="2024-01-08",
|
|
348
362
|
modalities=["image", "text"],
|
|
349
363
|
n_parameters=652_000_000,
|
|
364
|
+
n_embedding_parameters=None,
|
|
350
365
|
memory_usage_mb=2489,
|
|
351
366
|
max_tokens=64,
|
|
352
367
|
embed_dim=1024,
|
|
@@ -13,24 +13,27 @@ Based on:
|
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
15
15
|
import logging
|
|
16
|
-
from typing import Any
|
|
16
|
+
from typing import TYPE_CHECKING, Any
|
|
17
17
|
|
|
18
18
|
import torch
|
|
19
|
-
from torch.utils.data import DataLoader
|
|
20
19
|
from tqdm.auto import tqdm
|
|
21
20
|
|
|
22
21
|
from mteb._requires_package import (
|
|
23
22
|
requires_image_dependencies,
|
|
24
23
|
requires_package,
|
|
25
24
|
)
|
|
26
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
27
25
|
from mteb.models.abs_encoder import AbsEncoder
|
|
28
26
|
from mteb.models.model_implementations.colpali_models import (
|
|
29
27
|
COLPALI_CITATION,
|
|
30
28
|
COLPALI_TRAINING_DATA,
|
|
31
29
|
)
|
|
32
30
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
33
|
-
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from torch.utils.data import DataLoader
|
|
34
|
+
|
|
35
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
36
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
34
37
|
|
|
35
38
|
logger = logging.getLogger(__name__)
|
|
36
39
|
|
|
@@ -224,7 +224,8 @@ sonar = ModelMeta(
|
|
|
224
224
|
use_instructions=False, # it does take a language code as input
|
|
225
225
|
revision="a551c586dcf4a49c8fd847de369412d556a7f2f2",
|
|
226
226
|
release_date="2021-05-21",
|
|
227
|
-
n_parameters=None,
|
|
227
|
+
n_parameters=None,
|
|
228
|
+
n_embedding_parameters=None, # it is really multiple models so not sure how to calculate this
|
|
228
229
|
max_tokens=512, # https://github.com/facebookresearch/SONAR/blob/549d287466443bd8720f938047882630c1c5c3f7/sonar/models/sonar_text/builder.py#L139
|
|
229
230
|
embed_dim=1024,
|
|
230
231
|
license="mit",
|
|
@@ -12,6 +12,7 @@ spartan8806_atles_champion_embedding = ModelMeta(
|
|
|
12
12
|
revision="d4c74d7000bbd25f3597fc0f2dcde59ef1386e8f",
|
|
13
13
|
release_date="2025-11-15",
|
|
14
14
|
n_parameters=110_000_000,
|
|
15
|
+
n_embedding_parameters=23_444_736,
|
|
15
16
|
memory_usage_mb=420,
|
|
16
17
|
max_tokens=512,
|
|
17
18
|
embed_dim=768,
|
|
@@ -66,6 +66,7 @@ stella_en_400m = ModelMeta(
|
|
|
66
66
|
revision="1bb50bc7bb726810eac2140e62155b88b0df198f",
|
|
67
67
|
release_date="2024-07-12",
|
|
68
68
|
n_parameters=435_000_000,
|
|
69
|
+
n_embedding_parameters=None,
|
|
69
70
|
memory_usage_mb=1660,
|
|
70
71
|
max_tokens=8192,
|
|
71
72
|
embed_dim=4096,
|
|
@@ -101,6 +102,7 @@ stella_en_1_5b = ModelMeta(
|
|
|
101
102
|
revision="d03be74b361d4eb24f42a2fe5bd2e29917df4604",
|
|
102
103
|
release_date="2024-07-12",
|
|
103
104
|
n_parameters=1_540_000_000,
|
|
105
|
+
n_embedding_parameters=232_928_256,
|
|
104
106
|
memory_usage_mb=5887,
|
|
105
107
|
max_tokens=131072,
|
|
106
108
|
embed_dim=8960,
|
|
@@ -130,6 +132,7 @@ stella_large_zh_v3_1792d = ModelMeta(
|
|
|
130
132
|
revision="d5d39eb8cd11c80a63df53314e59997074469f09",
|
|
131
133
|
release_date="2024-02-17",
|
|
132
134
|
n_parameters=None,
|
|
135
|
+
n_embedding_parameters=21_635_072,
|
|
133
136
|
memory_usage_mb=None, # can't see on model card
|
|
134
137
|
embed_dim=1792,
|
|
135
138
|
license="not specified",
|
|
@@ -157,6 +160,7 @@ stella_base_zh_v3_1792d = ModelMeta(
|
|
|
157
160
|
revision="82254892a0fba125aa2abf3a4800d2dd12821343",
|
|
158
161
|
release_date="2024-02-17",
|
|
159
162
|
n_parameters=None,
|
|
163
|
+
n_embedding_parameters=16_226_304,
|
|
160
164
|
memory_usage_mb=None, # can't see on model card
|
|
161
165
|
embed_dim=1792,
|
|
162
166
|
license="mit",
|
|
@@ -185,6 +189,7 @@ stella_mrl_large_zh_v3_5_1792d = ModelMeta(
|
|
|
185
189
|
revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe",
|
|
186
190
|
release_date="2024-02-27",
|
|
187
191
|
n_parameters=int(326 * 1e6),
|
|
192
|
+
n_embedding_parameters=21_635_072,
|
|
188
193
|
memory_usage_mb=1242,
|
|
189
194
|
embed_dim=1792,
|
|
190
195
|
license="mit",
|
|
@@ -209,6 +214,7 @@ zpoint_large_embedding_zh = ModelMeta(
|
|
|
209
214
|
revision="b1075144f440ab4409c05622c1179130ebd57d03",
|
|
210
215
|
release_date="2024-06-04",
|
|
211
216
|
n_parameters=int(326 * 1e6),
|
|
217
|
+
n_embedding_parameters=21_635_072,
|
|
212
218
|
memory_usage_mb=1242,
|
|
213
219
|
embed_dim=1792,
|
|
214
220
|
license="mit",
|
|
@@ -327,6 +327,7 @@ tarka_embedding_150m_v1 = ModelMeta(
|
|
|
327
327
|
revision="b0ffecc4ef0d873e517507ed080e43b88b2704b9",
|
|
328
328
|
release_date="2025-11-04",
|
|
329
329
|
n_parameters=155_714_304,
|
|
330
|
+
n_embedding_parameters=None,
|
|
330
331
|
embed_dim=768,
|
|
331
332
|
max_tokens=2048,
|
|
332
333
|
license="gemma",
|
|
@@ -361,6 +362,7 @@ tarka_embedding_350m_v1 = ModelMeta(
|
|
|
361
362
|
revision="a850d6a329145474727424fed6b12b62096b8ba3",
|
|
362
363
|
release_date="2025-11-11",
|
|
363
364
|
n_parameters=354_483_968,
|
|
365
|
+
n_embedding_parameters=None,
|
|
364
366
|
memory_usage_mb=676,
|
|
365
367
|
embed_dim=1024,
|
|
366
368
|
max_tokens=128000,
|
|
@@ -22,6 +22,7 @@ text2vec_base_chinese = ModelMeta(
|
|
|
22
22
|
revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e",
|
|
23
23
|
release_date="2022-01-23",
|
|
24
24
|
n_parameters=int(102 * 1e6),
|
|
25
|
+
n_embedding_parameters=16_226_304,
|
|
25
26
|
embed_dim=768,
|
|
26
27
|
license="apache-2.0",
|
|
27
28
|
max_tokens=512,
|
|
@@ -51,6 +52,7 @@ text2vec_base_chinese_paraphrase = ModelMeta(
|
|
|
51
52
|
revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd",
|
|
52
53
|
release_date="2023-06-19",
|
|
53
54
|
n_parameters=118 * 1e6,
|
|
55
|
+
n_embedding_parameters=30_720_000,
|
|
54
56
|
memory_usage_mb=450,
|
|
55
57
|
embed_dim=768,
|
|
56
58
|
license="apache-2.0",
|
|
@@ -95,6 +97,7 @@ text2vec_base_multilingual = ModelMeta(
|
|
|
95
97
|
# So probably best not to.
|
|
96
98
|
loader=sentence_transformers_loader,
|
|
97
99
|
n_parameters=117654272,
|
|
100
|
+
n_embedding_parameters=96_014_208,
|
|
98
101
|
memory_usage_mb=449,
|
|
99
102
|
embed_dim=384,
|
|
100
103
|
license="apache-2.0",
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
8
8
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
9
9
|
from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
11
16
|
|
|
12
17
|
logger = logging.getLogger(__name__)
|
|
13
18
|
|
|
@@ -67,6 +72,7 @@ uae_large_v1 = ModelMeta(
|
|
|
67
72
|
revision="369c368f70f16a613f19f5598d4f12d9f44235d4",
|
|
68
73
|
release_date="2023-12-04", # initial commit of hf model.
|
|
69
74
|
n_parameters=int(335 * 1e6),
|
|
75
|
+
n_embedding_parameters=31_254_528,
|
|
70
76
|
memory_usage_mb=1278,
|
|
71
77
|
max_tokens=512,
|
|
72
78
|
embed_dim=1024,
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
6
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
3
|
-
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from mteb.types import PromptType
|
|
4
10
|
|
|
5
11
|
|
|
6
12
|
def instruction_template(
|
|
@@ -32,6 +38,7 @@ vdr_2b_multi_v1 = ModelMeta(
|
|
|
32
38
|
release_date="2024-01-08",
|
|
33
39
|
modalities=["text"], # TODO: integrate with image
|
|
34
40
|
n_parameters=2_000_000_000,
|
|
41
|
+
n_embedding_parameters=233_373_696,
|
|
35
42
|
memory_usage_mb=4213,
|
|
36
43
|
max_tokens=32768,
|
|
37
44
|
embed_dim=1536,
|
|
@@ -16,6 +16,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
|
|
|
16
16
|
loader=sentence_transformers_loader,
|
|
17
17
|
open_weights=True,
|
|
18
18
|
n_parameters=568_000_000,
|
|
19
|
+
n_embedding_parameters=256_002_048,
|
|
19
20
|
memory_usage_mb=2167,
|
|
20
21
|
embed_dim=1024,
|
|
21
22
|
license="cc-by-4.0",
|
|
@@ -41,6 +42,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
|
|
|
41
42
|
loader=sentence_transformers_loader,
|
|
42
43
|
open_weights=True,
|
|
43
44
|
n_parameters=568_000_000,
|
|
45
|
+
n_embedding_parameters=256_002_048,
|
|
44
46
|
memory_usage_mb=2167,
|
|
45
47
|
embed_dim=1024,
|
|
46
48
|
license="cc-by-4.0",
|
|
@@ -66,6 +68,7 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
|
|
|
66
68
|
loader=sentence_transformers_loader,
|
|
67
69
|
open_weights=True,
|
|
68
70
|
n_parameters=568_000_000,
|
|
71
|
+
n_embedding_parameters=256_002_048,
|
|
69
72
|
memory_usage_mb=2166,
|
|
70
73
|
embed_dim=1024,
|
|
71
74
|
license="cc-by-4.0",
|
|
@@ -98,6 +101,7 @@ hiieu_halong_embedding = ModelMeta(
|
|
|
98
101
|
use_instructions=False,
|
|
99
102
|
open_weights=True,
|
|
100
103
|
n_parameters=278_000_000,
|
|
104
|
+
n_embedding_parameters=192_001_536,
|
|
101
105
|
memory_usage_mb=1061,
|
|
102
106
|
embed_dim=768,
|
|
103
107
|
license="apache-2.0",
|
|
@@ -129,6 +133,7 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
|
|
|
129
133
|
use_instructions=False,
|
|
130
134
|
open_weights=True,
|
|
131
135
|
n_parameters=135_000_000,
|
|
136
|
+
n_embedding_parameters=49_152_768,
|
|
132
137
|
memory_usage_mb=517,
|
|
133
138
|
max_tokens=256,
|
|
134
139
|
embed_dim=768,
|
|
@@ -167,6 +172,7 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
|
|
|
167
172
|
use_instructions=False,
|
|
168
173
|
open_weights=True,
|
|
169
174
|
n_parameters=135_000_000,
|
|
175
|
+
n_embedding_parameters=49_152_768,
|
|
170
176
|
memory_usage_mb=515,
|
|
171
177
|
max_tokens=256,
|
|
172
178
|
embed_dim=768,
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
8
|
from mteb._requires_package import requires_image_dependencies
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from torch.utils.data import DataLoader
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
12
17
|
|
|
13
18
|
VISTA_CITATION = """@article{zhou2024vista,
|
|
14
19
|
title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
|
|
@@ -253,6 +258,7 @@ visualized_bge_base = ModelMeta(
|
|
|
253
258
|
release_date="2024-06-06",
|
|
254
259
|
modalities=["image", "text"],
|
|
255
260
|
n_parameters=196_000_000,
|
|
261
|
+
n_embedding_parameters=None,
|
|
256
262
|
memory_usage_mb=1631,
|
|
257
263
|
max_tokens=512,
|
|
258
264
|
embed_dim=768,
|
|
@@ -281,6 +287,7 @@ visualized_bge_m3 = ModelMeta(
|
|
|
281
287
|
release_date="2024-06-06",
|
|
282
288
|
modalities=["image", "text"],
|
|
283
289
|
n_parameters=872_909_505,
|
|
290
|
+
n_embedding_parameters=None,
|
|
284
291
|
memory_usage_mb=4263,
|
|
285
292
|
max_tokens=8192,
|
|
286
293
|
embed_dim=1024,
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
from tqdm.auto import tqdm
|
|
7
8
|
|
|
8
9
|
from mteb._requires_package import (
|
|
@@ -10,10 +11,14 @@ from mteb._requires_package import (
|
|
|
10
11
|
requires_package,
|
|
11
12
|
suggest_package,
|
|
12
13
|
)
|
|
13
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
14
14
|
from mteb.models.abs_encoder import AbsEncoder
|
|
15
15
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
16
|
-
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
17
22
|
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
|
@@ -275,6 +280,7 @@ vlm2vec_lora = ModelMeta(
|
|
|
275
280
|
release_date="2024-10-08",
|
|
276
281
|
modalities=["image", "text"],
|
|
277
282
|
n_parameters=None,
|
|
283
|
+
n_embedding_parameters=None,
|
|
278
284
|
memory_usage_mb=None,
|
|
279
285
|
max_tokens=131072,
|
|
280
286
|
embed_dim=3072,
|
|
@@ -299,6 +305,7 @@ vlm2vec_full = ModelMeta(
|
|
|
299
305
|
release_date="2024-10-08",
|
|
300
306
|
modalities=["image", "text"],
|
|
301
307
|
n_parameters=4_150_000_000,
|
|
308
|
+
n_embedding_parameters=None,
|
|
302
309
|
memory_usage_mb=7909,
|
|
303
310
|
max_tokens=131072,
|
|
304
311
|
embed_dim=3072,
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import time
|
|
2
4
|
from functools import wraps
|
|
3
|
-
from typing import Any, Literal
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
9
10
|
from mteb._requires_package import requires_package
|
|
10
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
11
|
from mteb.models.abs_encoder import AbsEncoder
|
|
12
12
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import PromptType
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
14
20
|
|
|
15
21
|
VOYAGE_TRAINING_DATA = set(
|
|
16
22
|
# Self-reported (message from VoyageAI member)
|
|
@@ -176,6 +182,7 @@ class VoyageModel(AbsEncoder):
|
|
|
176
182
|
model=self._model_name,
|
|
177
183
|
input_type=input_type,
|
|
178
184
|
output_dtype=output_dtype,
|
|
185
|
+
output_dimension=self.mteb_model_meta.embed_dim,
|
|
179
186
|
).embeddings
|
|
180
187
|
)
|
|
181
188
|
pbar.update(len(batch))
|
|
@@ -209,6 +216,32 @@ model_prompts = {
|
|
|
209
216
|
PromptType.document.value: "document",
|
|
210
217
|
}
|
|
211
218
|
|
|
219
|
+
voyage_4_large_2048d = ModelMeta(
|
|
220
|
+
name="voyageai/voyage-4-large (embed_dim=2048)",
|
|
221
|
+
model_type=["dense"],
|
|
222
|
+
revision="1",
|
|
223
|
+
release_date="2026-01-15",
|
|
224
|
+
languages=None, # supported languages not specified
|
|
225
|
+
loader=VoyageModel,
|
|
226
|
+
loader_kwargs=dict(
|
|
227
|
+
max_tokens=32000,
|
|
228
|
+
model_prompts=model_prompts,
|
|
229
|
+
),
|
|
230
|
+
max_tokens=32000,
|
|
231
|
+
embed_dim=2048,
|
|
232
|
+
open_weights=False,
|
|
233
|
+
n_parameters=None,
|
|
234
|
+
memory_usage_mb=None,
|
|
235
|
+
license=None,
|
|
236
|
+
reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
|
|
237
|
+
similarity_fn_name="cosine",
|
|
238
|
+
framework=["API"],
|
|
239
|
+
use_instructions=True,
|
|
240
|
+
training_datasets=VOYAGE_TRAINING_DATA,
|
|
241
|
+
public_training_code=None,
|
|
242
|
+
public_training_data=None,
|
|
243
|
+
)
|
|
244
|
+
|
|
212
245
|
voyage_4 = ModelMeta(
|
|
213
246
|
name="voyageai/voyage-4",
|
|
214
247
|
model_type=["dense"],
|
|
@@ -302,6 +335,7 @@ voyage_3_large = ModelMeta(
|
|
|
302
335
|
embed_dim=1024,
|
|
303
336
|
open_weights=False,
|
|
304
337
|
n_parameters=None,
|
|
338
|
+
n_embedding_parameters=None,
|
|
305
339
|
memory_usage_mb=None,
|
|
306
340
|
license=None,
|
|
307
341
|
reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/",
|
|
@@ -330,6 +364,7 @@ voyage_3_5 = ModelMeta(
|
|
|
330
364
|
embed_dim=1024,
|
|
331
365
|
open_weights=False,
|
|
332
366
|
n_parameters=None,
|
|
367
|
+
n_embedding_parameters=None,
|
|
333
368
|
memory_usage_mb=None,
|
|
334
369
|
license=None,
|
|
335
370
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -357,6 +392,7 @@ voyage_3_5_int8 = ModelMeta(
|
|
|
357
392
|
embed_dim=1024,
|
|
358
393
|
open_weights=False,
|
|
359
394
|
n_parameters=None,
|
|
395
|
+
n_embedding_parameters=None,
|
|
360
396
|
memory_usage_mb=None,
|
|
361
397
|
license=None,
|
|
362
398
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -384,6 +420,7 @@ voyage_3_5_binary = ModelMeta(
|
|
|
384
420
|
embed_dim=1024, # Same as original after unpacking from bits
|
|
385
421
|
open_weights=False,
|
|
386
422
|
n_parameters=None,
|
|
423
|
+
n_embedding_parameters=None,
|
|
387
424
|
memory_usage_mb=None,
|
|
388
425
|
license=None,
|
|
389
426
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -411,6 +448,7 @@ voyage_large_2_instruct = ModelMeta(
|
|
|
411
448
|
embed_dim=1024,
|
|
412
449
|
open_weights=False,
|
|
413
450
|
n_parameters=None,
|
|
451
|
+
n_embedding_parameters=None,
|
|
414
452
|
memory_usage_mb=None,
|
|
415
453
|
license=None,
|
|
416
454
|
reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/",
|
|
@@ -437,6 +475,7 @@ voyage_finance_2 = ModelMeta(
|
|
|
437
475
|
embed_dim=1024,
|
|
438
476
|
open_weights=False,
|
|
439
477
|
n_parameters=None,
|
|
478
|
+
n_embedding_parameters=None,
|
|
440
479
|
memory_usage_mb=None,
|
|
441
480
|
license=None,
|
|
442
481
|
reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/",
|
|
@@ -463,6 +502,7 @@ voyage_law_2 = ModelMeta(
|
|
|
463
502
|
embed_dim=1024,
|
|
464
503
|
open_weights=False,
|
|
465
504
|
n_parameters=None,
|
|
505
|
+
n_embedding_parameters=None,
|
|
466
506
|
memory_usage_mb=None,
|
|
467
507
|
license=None,
|
|
468
508
|
reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/",
|
|
@@ -489,6 +529,7 @@ voyage_code_2 = ModelMeta(
|
|
|
489
529
|
embed_dim=1536,
|
|
490
530
|
open_weights=False,
|
|
491
531
|
n_parameters=None,
|
|
532
|
+
n_embedding_parameters=None,
|
|
492
533
|
memory_usage_mb=None,
|
|
493
534
|
license=None,
|
|
494
535
|
reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/",
|
|
@@ -515,6 +556,7 @@ voyage_code_3 = ModelMeta(
|
|
|
515
556
|
embed_dim=1024,
|
|
516
557
|
open_weights=False,
|
|
517
558
|
n_parameters=None,
|
|
559
|
+
n_embedding_parameters=None,
|
|
518
560
|
memory_usage_mb=None,
|
|
519
561
|
license=None,
|
|
520
562
|
reference="https://blog.voyageai.com/2024/12/04/voyage-code-3/",
|
|
@@ -542,6 +584,7 @@ voyage_large_2 = ModelMeta(
|
|
|
542
584
|
embed_dim=1536,
|
|
543
585
|
open_weights=False,
|
|
544
586
|
n_parameters=None,
|
|
587
|
+
n_embedding_parameters=None,
|
|
545
588
|
memory_usage_mb=None,
|
|
546
589
|
license=None,
|
|
547
590
|
reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
|
|
@@ -568,6 +611,7 @@ voyage_2 = ModelMeta(
|
|
|
568
611
|
embed_dim=1024,
|
|
569
612
|
open_weights=False,
|
|
570
613
|
n_parameters=None,
|
|
614
|
+
n_embedding_parameters=None,
|
|
571
615
|
memory_usage_mb=None,
|
|
572
616
|
license=None,
|
|
573
617
|
reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
|
|
@@ -593,6 +637,7 @@ voyage_multilingual_2 = ModelMeta(
|
|
|
593
637
|
embed_dim=1024,
|
|
594
638
|
open_weights=False,
|
|
595
639
|
n_parameters=None,
|
|
640
|
+
n_embedding_parameters=None,
|
|
596
641
|
memory_usage_mb=None,
|
|
597
642
|
license=None,
|
|
598
643
|
reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/",
|
|
@@ -619,6 +664,7 @@ voyage_3 = ModelMeta(
|
|
|
619
664
|
embed_dim=1024,
|
|
620
665
|
open_weights=False,
|
|
621
666
|
n_parameters=None,
|
|
667
|
+
n_embedding_parameters=None,
|
|
622
668
|
memory_usage_mb=None,
|
|
623
669
|
license=None,
|
|
624
670
|
reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
|
|
@@ -645,6 +691,7 @@ voyage_3_lite = ModelMeta(
|
|
|
645
691
|
embed_dim=512,
|
|
646
692
|
open_weights=False,
|
|
647
693
|
n_parameters=None,
|
|
694
|
+
n_embedding_parameters=None,
|
|
648
695
|
memory_usage_mb=None,
|
|
649
696
|
license=None,
|
|
650
697
|
reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
|
|
@@ -673,6 +720,7 @@ voyage_3_exp = ModelMeta(
|
|
|
673
720
|
open_weights=False,
|
|
674
721
|
# from their card https://huggingface.co/voyageai/voyage-3-m-exp#model-information
|
|
675
722
|
n_parameters=int(6918 * 1e6),
|
|
723
|
+
n_embedding_parameters=None,
|
|
676
724
|
memory_usage_mb=None,
|
|
677
725
|
license=None,
|
|
678
726
|
reference="https://huggingface.co/voyageai/voyage-3-m-exp",
|
|
@@ -4,17 +4,19 @@ import logging
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Literal
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
7
|
from tqdm.auto import tqdm
|
|
9
8
|
|
|
10
9
|
from mteb._requires_package import requires_image_dependencies, requires_package
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
10
|
from mteb.models.abs_encoder import AbsEncoder
|
|
13
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
14
|
-
from mteb.types import
|
|
12
|
+
from mteb.types import PromptType
|
|
15
13
|
|
|
16
14
|
if TYPE_CHECKING:
|
|
17
15
|
from PIL import Image
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
18
20
|
|
|
19
21
|
logger = logging.getLogger(__name__)
|
|
20
22
|
|
|
@@ -27,6 +29,8 @@ def _downsample_image(
|
|
|
27
29
|
Returns:
|
|
28
30
|
The downsampled image.
|
|
29
31
|
"""
|
|
32
|
+
from PIL.Image import Resampling
|
|
33
|
+
|
|
30
34
|
width, height = image.size
|
|
31
35
|
pixels = width * height
|
|
32
36
|
|
|
@@ -42,15 +46,15 @@ def _downsample_image(
|
|
|
42
46
|
logger.info(
|
|
43
47
|
f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
|
|
44
48
|
)
|
|
45
|
-
return image.resize(new_size,
|
|
49
|
+
return image.resize(new_size, Resampling.LANCZOS)
|
|
46
50
|
if width > height:
|
|
47
51
|
if width > 10000:
|
|
48
52
|
logger.error("Processing extremely wide images.")
|
|
49
|
-
return image.resize((10000, height),
|
|
53
|
+
return image.resize((10000, height), Resampling.LANCZOS)
|
|
50
54
|
else:
|
|
51
55
|
if height > 10000:
|
|
52
56
|
logger.error("Processing extremely high images.")
|
|
53
|
-
return image.resize((width, 10000),
|
|
57
|
+
return image.resize((width, 10000), Resampling.LANCZOS)
|
|
54
58
|
return image
|
|
55
59
|
|
|
56
60
|
|
|
@@ -211,6 +215,7 @@ voyage_v = ModelMeta(
|
|
|
211
215
|
revision="1",
|
|
212
216
|
release_date="2024-11-10",
|
|
213
217
|
n_parameters=None,
|
|
218
|
+
n_embedding_parameters=None,
|
|
214
219
|
memory_usage_mb=None,
|
|
215
220
|
max_tokens=32768,
|
|
216
221
|
embed_dim=1024,
|