mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,27 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from collections.abc import Sequence
|
|
3
4
|
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
8
|
|
|
9
9
|
import mteb
|
|
10
10
|
from mteb._create_dataloaders import _corpus_to_dict
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
|
-
from mteb.models.models_protocols import PromptType
|
|
14
12
|
from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
|
|
15
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import PromptType
|
|
16
14
|
|
|
17
15
|
from .bge_models import bge_full_data
|
|
18
16
|
|
|
19
17
|
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Sequence
|
|
19
|
+
|
|
20
|
+
from torch.utils.data import DataLoader
|
|
21
|
+
|
|
20
22
|
from mteb.abstasks import (
|
|
21
23
|
AbsTaskClassification,
|
|
22
24
|
AbsTaskRetrieval,
|
|
23
25
|
AbsTaskSummarization,
|
|
24
26
|
)
|
|
27
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
28
|
+
from mteb.types import Array, BatchedInput
|
|
25
29
|
logger = logging.getLogger(__name__)
|
|
26
30
|
|
|
27
31
|
CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
|
|
@@ -222,6 +226,7 @@ cde_small_v1 = ModelMeta(
|
|
|
222
226
|
revision="e151df18af0d7f1d1c37b074fee58406ececf19f",
|
|
223
227
|
release_date="2024-09-24",
|
|
224
228
|
n_parameters=int(281 * 1e6),
|
|
229
|
+
n_embedding_parameters=None,
|
|
225
230
|
memory_usage_mb=1072, # Though the second-stage model is only 140M
|
|
226
231
|
max_tokens=512,
|
|
227
232
|
embed_dim=768,
|
|
@@ -251,6 +256,7 @@ cde_small_v2 = ModelMeta(
|
|
|
251
256
|
revision="4e1d021a6c3fd7ce8aa0a7204057eee5ae61d390",
|
|
252
257
|
release_date="2025-01-13",
|
|
253
258
|
n_parameters=int(306 * 1e6),
|
|
259
|
+
n_embedding_parameters=None,
|
|
254
260
|
memory_usage_mb=1166, # Though the second-stage model is only 140M
|
|
255
261
|
max_tokens=512,
|
|
256
262
|
embed_dim=768,
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
8
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
9
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
class CLIPModel(AbsEncoder):
|
|
@@ -123,6 +128,7 @@ clip_vit_large_patch14 = ModelMeta(
|
|
|
123
128
|
release_date="2021-02-26",
|
|
124
129
|
modalities=["image", "text"],
|
|
125
130
|
n_parameters=428_000_000,
|
|
131
|
+
n_embedding_parameters=None,
|
|
126
132
|
memory_usage_mb=1631,
|
|
127
133
|
max_tokens=77,
|
|
128
134
|
embed_dim=768,
|
|
@@ -147,6 +153,7 @@ clip_vit_base_patch32 = ModelMeta(
|
|
|
147
153
|
release_date="2021-02-26",
|
|
148
154
|
modalities=["image", "text"],
|
|
149
155
|
n_parameters=151_000_000,
|
|
156
|
+
n_embedding_parameters=None,
|
|
150
157
|
memory_usage_mb=576,
|
|
151
158
|
max_tokens=77,
|
|
152
159
|
embed_dim=512,
|
|
@@ -171,6 +178,7 @@ clip_vit_base_patch16 = ModelMeta(
|
|
|
171
178
|
release_date="2021-02-26",
|
|
172
179
|
modalities=["image", "text"],
|
|
173
180
|
n_parameters=151_000_000,
|
|
181
|
+
n_embedding_parameters=None,
|
|
174
182
|
memory_usage_mb=576,
|
|
175
183
|
max_tokens=77,
|
|
176
184
|
embed_dim=512,
|
|
@@ -30,6 +30,7 @@ e5_nl_small = ModelMeta(
|
|
|
30
30
|
revision="0243664a6c5e12eef854b091eb283e51833c3e9f",
|
|
31
31
|
release_date="2025-09-23",
|
|
32
32
|
n_parameters=40_800_000,
|
|
33
|
+
n_embedding_parameters=19_200_768,
|
|
33
34
|
memory_usage_mb=78,
|
|
34
35
|
embed_dim=384,
|
|
35
36
|
license="mit",
|
|
@@ -57,6 +58,7 @@ e5_nl_base = ModelMeta(
|
|
|
57
58
|
revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956",
|
|
58
59
|
release_date="2025-09-23",
|
|
59
60
|
n_parameters=124_400_000,
|
|
61
|
+
n_embedding_parameters=38_401_536,
|
|
60
62
|
memory_usage_mb=237,
|
|
61
63
|
embed_dim=768,
|
|
62
64
|
license="mit",
|
|
@@ -84,6 +86,7 @@ e5_nl_large = ModelMeta(
|
|
|
84
86
|
revision="683333f86ed9eb3699b5567f0fdabeb958d412b0",
|
|
85
87
|
release_date="2025-09-23",
|
|
86
88
|
n_parameters=355_000_000,
|
|
89
|
+
n_embedding_parameters=51_202_048,
|
|
87
90
|
memory_usage_mb=1355,
|
|
88
91
|
embed_dim=1024,
|
|
89
92
|
license="mit",
|
|
@@ -236,6 +236,7 @@ F2LLM_0B6 = ModelMeta(
|
|
|
236
236
|
revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7",
|
|
237
237
|
release_date="2025-09-18",
|
|
238
238
|
n_parameters=595_776_512,
|
|
239
|
+
n_embedding_parameters=None,
|
|
239
240
|
memory_usage_mb=1137,
|
|
240
241
|
embed_dim=1024,
|
|
241
242
|
license="apache-2.0",
|
|
@@ -266,6 +267,7 @@ F2LLM_1B7 = ModelMeta(
|
|
|
266
267
|
revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c",
|
|
267
268
|
release_date="2025-09-18",
|
|
268
269
|
n_parameters=1_720_574_976,
|
|
270
|
+
n_embedding_parameters=None,
|
|
269
271
|
memory_usage_mb=3282,
|
|
270
272
|
embed_dim=2560,
|
|
271
273
|
license="apache-2.0",
|
|
@@ -296,6 +298,7 @@ F2LLM_4B = ModelMeta(
|
|
|
296
298
|
revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25",
|
|
297
299
|
release_date="2025-09-18",
|
|
298
300
|
n_parameters=4_021_774_336,
|
|
301
|
+
n_embedding_parameters=None,
|
|
299
302
|
memory_usage_mb=7672,
|
|
300
303
|
embed_dim=2560,
|
|
301
304
|
license="apache-2.0",
|
|
@@ -318,6 +321,7 @@ C2LLM_0B5 = ModelMeta(
|
|
|
318
321
|
release_date="2025-12-22",
|
|
319
322
|
languages=c2llm_languages,
|
|
320
323
|
n_parameters=497252096,
|
|
324
|
+
n_embedding_parameters=None,
|
|
321
325
|
memory_usage_mb=948.0,
|
|
322
326
|
max_tokens=32768,
|
|
323
327
|
embed_dim=896,
|
|
@@ -346,6 +350,7 @@ C2LLM_7B = ModelMeta(
|
|
|
346
350
|
release_date="2025-12-22",
|
|
347
351
|
languages=c2llm_languages,
|
|
348
352
|
n_parameters=7667028992,
|
|
353
|
+
n_embedding_parameters=None,
|
|
349
354
|
memory_usage_mb=14624.0,
|
|
350
355
|
max_tokens=32768,
|
|
351
356
|
embed_dim=3584,
|
|
@@ -28,6 +28,7 @@ codesage_large = ModelMeta(
|
|
|
28
28
|
release_date="2024-02-03",
|
|
29
29
|
modalities=["text"],
|
|
30
30
|
n_parameters=1_300_000_000,
|
|
31
|
+
n_embedding_parameters=100_667_392,
|
|
31
32
|
memory_usage_mb=4959,
|
|
32
33
|
max_tokens=2048,
|
|
33
34
|
embed_dim=2048,
|
|
@@ -55,6 +56,7 @@ codesage_base = ModelMeta(
|
|
|
55
56
|
release_date="2024-02-03",
|
|
56
57
|
modalities=["text"],
|
|
57
58
|
n_parameters=356_000_000,
|
|
59
|
+
n_embedding_parameters=50_333_696,
|
|
58
60
|
memory_usage_mb=1358,
|
|
59
61
|
max_tokens=2048,
|
|
60
62
|
embed_dim=1024,
|
|
@@ -82,6 +84,7 @@ codesage_small = ModelMeta(
|
|
|
82
84
|
release_date="2024-02-03",
|
|
83
85
|
modalities=["text"],
|
|
84
86
|
n_parameters=130_000_000,
|
|
87
|
+
n_embedding_parameters=50_333_696,
|
|
85
88
|
memory_usage_mb=496,
|
|
86
89
|
max_tokens=2048,
|
|
87
90
|
embed_dim=1024,
|
|
@@ -1,18 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import time
|
|
3
5
|
from functools import wraps
|
|
4
|
-
from typing import Any, Literal, get_args
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, get_args
|
|
5
7
|
|
|
6
8
|
import numpy as np
|
|
7
9
|
import torch
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
from tqdm.auto import tqdm
|
|
10
11
|
|
|
11
12
|
from mteb._requires_package import requires_package
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
14
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
15
|
-
from mteb.types import
|
|
15
|
+
from mteb.types import PromptType
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput
|
|
16
22
|
|
|
17
23
|
logger = logging.getLogger(__name__)
|
|
18
24
|
|
|
@@ -386,6 +392,7 @@ cohere_mult_3 = ModelMeta(
|
|
|
386
392
|
revision="1",
|
|
387
393
|
release_date="2023-11-02",
|
|
388
394
|
n_parameters=None,
|
|
395
|
+
n_embedding_parameters=None,
|
|
389
396
|
memory_usage_mb=None,
|
|
390
397
|
max_tokens=None,
|
|
391
398
|
embed_dim=512,
|
|
@@ -412,6 +419,7 @@ cohere_eng_3 = ModelMeta(
|
|
|
412
419
|
revision="1",
|
|
413
420
|
release_date="2023-11-02",
|
|
414
421
|
n_parameters=None,
|
|
422
|
+
n_embedding_parameters=None,
|
|
415
423
|
memory_usage_mb=None,
|
|
416
424
|
max_tokens=512,
|
|
417
425
|
embed_dim=1024,
|
|
@@ -437,6 +445,7 @@ cohere_mult_light_3 = ModelMeta(
|
|
|
437
445
|
reference="https://cohere.com/blog/introducing-embed-v3",
|
|
438
446
|
release_date="2023-11-02",
|
|
439
447
|
n_parameters=None,
|
|
448
|
+
n_embedding_parameters=None,
|
|
440
449
|
memory_usage_mb=None,
|
|
441
450
|
max_tokens=512,
|
|
442
451
|
embed_dim=384,
|
|
@@ -462,6 +471,7 @@ cohere_eng_light_3 = ModelMeta(
|
|
|
462
471
|
revision="1",
|
|
463
472
|
release_date="2023-11-02",
|
|
464
473
|
n_parameters=None,
|
|
474
|
+
n_embedding_parameters=None,
|
|
465
475
|
memory_usage_mb=None,
|
|
466
476
|
max_tokens=512,
|
|
467
477
|
embed_dim=384,
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import base64
|
|
2
4
|
import io
|
|
3
5
|
import os
|
|
4
6
|
import time
|
|
5
|
-
from typing import Any, Literal, get_args
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal, get_args
|
|
6
8
|
|
|
7
9
|
import torch
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
from tqdm.auto import tqdm
|
|
10
11
|
|
|
11
12
|
from mteb._requires_package import requires_image_dependencies, requires_package
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models import ModelMeta
|
|
14
14
|
from mteb.models.abs_encoder import AbsEncoder
|
|
15
15
|
from mteb.models.model_implementations.cohere_models import (
|
|
@@ -18,7 +18,12 @@ from mteb.models.model_implementations.cohere_models import (
|
|
|
18
18
|
retry_with_rate_limit,
|
|
19
19
|
)
|
|
20
20
|
from mteb.models.model_meta import ScoringFunction
|
|
21
|
-
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from torch.utils.data import DataLoader
|
|
24
|
+
|
|
25
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
26
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
22
27
|
|
|
23
28
|
|
|
24
29
|
def _post_process_embeddings(
|
|
@@ -386,6 +391,7 @@ cohere_mult_3 = ModelMeta(
|
|
|
386
391
|
revision="1",
|
|
387
392
|
release_date="2024-10-24",
|
|
388
393
|
n_parameters=None,
|
|
394
|
+
n_embedding_parameters=None,
|
|
389
395
|
memory_usage_mb=None,
|
|
390
396
|
max_tokens=None,
|
|
391
397
|
embed_dim=1024,
|
|
@@ -410,6 +416,7 @@ cohere_eng_3 = ModelMeta(
|
|
|
410
416
|
revision="1",
|
|
411
417
|
release_date="2024-10-24",
|
|
412
418
|
n_parameters=None,
|
|
419
|
+
n_embedding_parameters=None,
|
|
413
420
|
memory_usage_mb=None,
|
|
414
421
|
max_tokens=None,
|
|
415
422
|
embed_dim=1024,
|
|
@@ -434,6 +441,7 @@ cohere_embed_v4_multimodal = ModelMeta(
|
|
|
434
441
|
revision="1",
|
|
435
442
|
release_date="2024-12-01",
|
|
436
443
|
n_parameters=None,
|
|
444
|
+
n_embedding_parameters=None,
|
|
437
445
|
memory_usage_mb=None,
|
|
438
446
|
max_tokens=128000,
|
|
439
447
|
embed_dim=1536,
|
|
@@ -458,6 +466,7 @@ cohere_embed_v4_multimodal_binary = ModelMeta(
|
|
|
458
466
|
revision="1",
|
|
459
467
|
release_date="2024-12-01",
|
|
460
468
|
n_parameters=None,
|
|
469
|
+
n_embedding_parameters=None,
|
|
461
470
|
memory_usage_mb=None,
|
|
462
471
|
max_tokens=128000,
|
|
463
472
|
embed_dim=1536,
|
|
@@ -483,6 +492,7 @@ cohere_embed_v4_multimodal_int8 = ModelMeta(
|
|
|
483
492
|
revision="1",
|
|
484
493
|
release_date="2024-12-01",
|
|
485
494
|
n_parameters=None,
|
|
495
|
+
n_embedding_parameters=None,
|
|
486
496
|
memory_usage_mb=None,
|
|
487
497
|
max_tokens=128000,
|
|
488
498
|
embed_dim=1536,
|
|
@@ -4,20 +4,21 @@ import logging
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
7
|
from tqdm.auto import tqdm
|
|
9
8
|
|
|
10
9
|
from mteb._requires_package import (
|
|
11
10
|
requires_image_dependencies,
|
|
12
11
|
requires_package,
|
|
13
12
|
)
|
|
14
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
16
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
17
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
18
15
|
|
|
19
16
|
if TYPE_CHECKING:
|
|
20
17
|
from PIL import Image
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
23
24
|
|
|
@@ -219,6 +220,7 @@ colpali_v1_1 = ModelMeta(
|
|
|
219
220
|
release_date="2024-08-21",
|
|
220
221
|
modalities=["image", "text"],
|
|
221
222
|
n_parameters=2_920_000_000,
|
|
223
|
+
n_embedding_parameters=None,
|
|
222
224
|
memory_usage_mb=4700,
|
|
223
225
|
max_tokens=16384,
|
|
224
226
|
embed_dim=128,
|
|
@@ -246,6 +248,7 @@ colpali_v1_2 = ModelMeta(
|
|
|
246
248
|
release_date="2024-08-26",
|
|
247
249
|
modalities=["image", "text"],
|
|
248
250
|
n_parameters=2_920_000_000,
|
|
251
|
+
n_embedding_parameters=None,
|
|
249
252
|
memory_usage_mb=4700,
|
|
250
253
|
max_tokens=16384,
|
|
251
254
|
embed_dim=128,
|
|
@@ -273,6 +276,7 @@ colpali_v1_3 = ModelMeta(
|
|
|
273
276
|
release_date="2024-11-01",
|
|
274
277
|
modalities=["image", "text"],
|
|
275
278
|
n_parameters=2_920_000_000,
|
|
279
|
+
n_embedding_parameters=None,
|
|
276
280
|
memory_usage_mb=4700,
|
|
277
281
|
max_tokens=16384,
|
|
278
282
|
embed_dim=128,
|
|
@@ -1,18 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
from tqdm.auto import tqdm
|
|
7
8
|
|
|
8
9
|
from mteb._requires_package import (
|
|
9
10
|
requires_image_dependencies,
|
|
10
11
|
requires_package,
|
|
11
12
|
)
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
14
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
15
|
-
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from torch.utils.data import DataLoader
|
|
18
|
+
|
|
19
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
20
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
16
21
|
|
|
17
22
|
from .colpali_models import (
|
|
18
23
|
COLPALI_CITATION,
|
|
@@ -219,6 +224,7 @@ colqwen2 = ModelMeta(
|
|
|
219
224
|
release_date="2025-11-03",
|
|
220
225
|
modalities=["image", "text"],
|
|
221
226
|
n_parameters=2_210_000_000,
|
|
227
|
+
n_embedding_parameters=None,
|
|
222
228
|
memory_usage_mb=7200,
|
|
223
229
|
max_tokens=32768,
|
|
224
230
|
embed_dim=128,
|
|
@@ -246,6 +252,7 @@ colqwen2_5 = ModelMeta(
|
|
|
246
252
|
release_date="2025-01-31",
|
|
247
253
|
modalities=["image", "text"],
|
|
248
254
|
n_parameters=3_000_000_000,
|
|
255
|
+
n_embedding_parameters=None,
|
|
249
256
|
memory_usage_mb=7200,
|
|
250
257
|
max_tokens=128000,
|
|
251
258
|
embed_dim=128,
|
|
@@ -290,6 +297,7 @@ colqwen3_8b = ModelMeta(
|
|
|
290
297
|
release_date="2025-11-26",
|
|
291
298
|
modalities=["image", "text"],
|
|
292
299
|
n_parameters=8_000_000_000,
|
|
300
|
+
n_embedding_parameters=None,
|
|
293
301
|
memory_usage_mb=16724,
|
|
294
302
|
max_tokens=262144,
|
|
295
303
|
embed_dim=320,
|
|
@@ -314,6 +322,7 @@ colqwen3_4b = ModelMeta(
|
|
|
314
322
|
release_date="2025-11-26",
|
|
315
323
|
modalities=["image", "text"],
|
|
316
324
|
n_parameters=4_000_000_000,
|
|
325
|
+
n_embedding_parameters=None,
|
|
317
326
|
memory_usage_mb=8466,
|
|
318
327
|
max_tokens=262144,
|
|
319
328
|
embed_dim=320,
|
|
@@ -329,32 +338,6 @@ colqwen3_4b = ModelMeta(
|
|
|
329
338
|
citation=TOMORO_CITATION,
|
|
330
339
|
)
|
|
331
340
|
|
|
332
|
-
colnomic_7b = ModelMeta(
|
|
333
|
-
loader=ColQwen2_5Wrapper,
|
|
334
|
-
loader_kwargs=dict(
|
|
335
|
-
torch_dtype=torch.float16,
|
|
336
|
-
),
|
|
337
|
-
name="nomic-ai/colnomic-embed-multimodal-7b",
|
|
338
|
-
model_type=["late-interaction"],
|
|
339
|
-
languages=["eng-Latn"],
|
|
340
|
-
revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f",
|
|
341
|
-
release_date="2025-03-31",
|
|
342
|
-
modalities=["image", "text"],
|
|
343
|
-
n_parameters=7_000_000_000,
|
|
344
|
-
memory_usage_mb=14400,
|
|
345
|
-
max_tokens=128000,
|
|
346
|
-
embed_dim=128,
|
|
347
|
-
license="apache-2.0",
|
|
348
|
-
open_weights=True,
|
|
349
|
-
public_training_code="https://github.com/nomic-ai/colpali",
|
|
350
|
-
public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
|
|
351
|
-
framework=["ColPali", "safetensors"],
|
|
352
|
-
reference="https://huggingface.co/nomic-ai/colnomic-embed-multimodal-7b",
|
|
353
|
-
similarity_fn_name="MaxSim",
|
|
354
|
-
use_instructions=True,
|
|
355
|
-
training_datasets=COLPALI_TRAINING_DATA,
|
|
356
|
-
citation=COLPALI_CITATION,
|
|
357
|
-
)
|
|
358
341
|
|
|
359
342
|
COLNOMIC_CITATION = """
|
|
360
343
|
@misc{nomicembedmultimodal2025,
|
|
@@ -386,6 +369,7 @@ colnomic_3b = ModelMeta(
|
|
|
386
369
|
release_date="2025-03-31",
|
|
387
370
|
modalities=["image", "text"],
|
|
388
371
|
n_parameters=3_000_000_000,
|
|
372
|
+
n_embedding_parameters=None,
|
|
389
373
|
memory_usage_mb=7200,
|
|
390
374
|
max_tokens=128000,
|
|
391
375
|
embed_dim=128,
|
|
@@ -402,7 +386,7 @@ colnomic_3b = ModelMeta(
|
|
|
402
386
|
)
|
|
403
387
|
|
|
404
388
|
colnomic_7b = ModelMeta(
|
|
405
|
-
loader=
|
|
389
|
+
loader=ColQwen2_5Wrapper,
|
|
406
390
|
loader_kwargs=dict(
|
|
407
391
|
torch_dtype=torch.float16,
|
|
408
392
|
),
|
|
@@ -451,6 +435,7 @@ evoqwen25_vl_retriever_3b_v1 = ModelMeta(
|
|
|
451
435
|
release_date="2025-11-04",
|
|
452
436
|
modalities=["image", "text"],
|
|
453
437
|
n_parameters=3_000_000_000,
|
|
438
|
+
n_embedding_parameters=None,
|
|
454
439
|
memory_usage_mb=7200,
|
|
455
440
|
max_tokens=128000,
|
|
456
441
|
embed_dim=128,
|
|
@@ -477,6 +462,7 @@ evoqwen25_vl_retriever_7b_v1 = ModelMeta(
|
|
|
477
462
|
release_date="2025-11-04",
|
|
478
463
|
modalities=["image", "text"],
|
|
479
464
|
n_parameters=7_000_000_000,
|
|
465
|
+
n_embedding_parameters=None,
|
|
480
466
|
memory_usage_mb=14400,
|
|
481
467
|
max_tokens=128000,
|
|
482
468
|
embed_dim=128,
|
|
@@ -56,10 +56,11 @@ colsmol_256m = ModelMeta(
|
|
|
56
56
|
name="vidore/colSmol-256M",
|
|
57
57
|
model_type=["late-interaction"],
|
|
58
58
|
languages=["eng-Latn"],
|
|
59
|
-
revision="
|
|
59
|
+
revision="a59110fdf114638b8018e6c9a018907e12f14855",
|
|
60
60
|
release_date="2025-01-22",
|
|
61
61
|
modalities=["image", "text"],
|
|
62
62
|
n_parameters=256_000_000,
|
|
63
|
+
n_embedding_parameters=None,
|
|
63
64
|
memory_usage_mb=800,
|
|
64
65
|
max_tokens=8192,
|
|
65
66
|
embed_dim=128,
|
|
@@ -87,6 +88,7 @@ colsmol_500m = ModelMeta(
|
|
|
87
88
|
release_date="2025-01-22",
|
|
88
89
|
modalities=["image", "text"],
|
|
89
90
|
n_parameters=500_000_000,
|
|
91
|
+
n_embedding_parameters=None,
|
|
90
92
|
memory_usage_mb=1200,
|
|
91
93
|
max_tokens=8192,
|
|
92
94
|
embed_dim=128,
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
@@ -5,20 +7,24 @@ import os
|
|
|
5
7
|
import random
|
|
6
8
|
import string
|
|
7
9
|
import time
|
|
8
|
-
from typing import Any
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
9
11
|
|
|
10
12
|
import numpy as np
|
|
11
13
|
import requests
|
|
12
|
-
from torch.utils.data import DataLoader
|
|
13
14
|
|
|
14
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
15
|
from mteb.models.abs_encoder import AbsEncoder
|
|
16
16
|
from mteb.models.model_meta import ModelMeta
|
|
17
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
18
17
|
|
|
19
18
|
from .bge_models import bge_full_data
|
|
20
19
|
from .e5_instruct import E5_MISTRAL_TRAINING_DATA
|
|
21
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from torch.utils.data import DataLoader
|
|
23
|
+
|
|
24
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
25
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
26
|
+
|
|
27
|
+
|
|
22
28
|
conan_zh_datasets = {
|
|
23
29
|
"BQ",
|
|
24
30
|
"LCQMC",
|
|
@@ -205,6 +211,7 @@ Conan_embedding_v2 = ModelMeta(
|
|
|
205
211
|
embed_dim=3584,
|
|
206
212
|
open_weights=False,
|
|
207
213
|
n_parameters=None,
|
|
214
|
+
n_embedding_parameters=None,
|
|
208
215
|
memory_usage_mb=None,
|
|
209
216
|
license="apache-2.0",
|
|
210
217
|
reference="https://huggingface.co/TencentBAC/Conan-embedding-v2",
|