mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,18 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
|
|
6
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
7
7
|
from mteb.models.abs_encoder import AbsEncoder
|
|
8
8
|
from mteb.models.model_meta import ModelMeta
|
|
9
|
-
from mteb.types import
|
|
9
|
+
from mteb.types import PromptType
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput
|
|
10
16
|
|
|
11
17
|
v2_training_data = {
|
|
12
18
|
"MSMARCO",
|
|
@@ -134,13 +140,14 @@ opensearch_neural_sparse_encoding_doc_v3_gte = ModelMeta(
|
|
|
134
140
|
revision="a8abaa916125ee512a7a8f4d706d07eb0128a8e6",
|
|
135
141
|
release_date="2025-06-18",
|
|
136
142
|
n_parameters=137_394_234,
|
|
143
|
+
n_embedding_parameters=23_440_896,
|
|
137
144
|
memory_usage_mb=549,
|
|
138
145
|
embed_dim=30522,
|
|
139
146
|
license="apache-2.0",
|
|
140
147
|
max_tokens=8192,
|
|
141
148
|
reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
|
|
142
149
|
similarity_fn_name="dot",
|
|
143
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
150
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
144
151
|
public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
|
|
145
152
|
public_training_data=True,
|
|
146
153
|
use_instructions=True,
|
|
@@ -160,13 +167,14 @@ opensearch_neural_sparse_encoding_doc_v3_distill = ModelMeta(
|
|
|
160
167
|
revision="babf71f3c48695e2e53a978208e8aba48335e3c0",
|
|
161
168
|
release_date="2025-03-28",
|
|
162
169
|
n_parameters=66_985_530,
|
|
170
|
+
n_embedding_parameters=23_440_896,
|
|
163
171
|
memory_usage_mb=267,
|
|
164
172
|
embed_dim=30522,
|
|
165
173
|
license="apache-2.0",
|
|
166
174
|
max_tokens=512,
|
|
167
175
|
reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
168
176
|
similarity_fn_name="dot",
|
|
169
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
177
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
170
178
|
public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
|
|
171
179
|
public_training_data=True,
|
|
172
180
|
use_instructions=True,
|
|
@@ -182,13 +190,14 @@ opensearch_neural_sparse_encoding_doc_v2_distill = ModelMeta(
|
|
|
182
190
|
revision="8921a26c78b8559d6604eb1f5c0b74c079bee38f",
|
|
183
191
|
release_date="2024-07-17",
|
|
184
192
|
n_parameters=66_985_530,
|
|
193
|
+
n_embedding_parameters=23_440_896,
|
|
185
194
|
memory_usage_mb=267,
|
|
186
195
|
embed_dim=30522,
|
|
187
196
|
license="apache-2.0",
|
|
188
197
|
max_tokens=512,
|
|
189
198
|
reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill",
|
|
190
199
|
similarity_fn_name="dot",
|
|
191
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
200
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
192
201
|
public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
|
|
193
202
|
public_training_data=True,
|
|
194
203
|
use_instructions=True,
|
|
@@ -205,13 +214,14 @@ opensearch_neural_sparse_encoding_doc_v2_mini = ModelMeta(
|
|
|
205
214
|
revision="4af867a426867dfdd744097531046f4289a32fdd",
|
|
206
215
|
release_date="2024-07-18",
|
|
207
216
|
n_parameters=22_744_506,
|
|
217
|
+
n_embedding_parameters=11_720_448,
|
|
208
218
|
memory_usage_mb=86,
|
|
209
219
|
embed_dim=30522,
|
|
210
220
|
license="apache-2.0",
|
|
211
221
|
max_tokens=512,
|
|
212
222
|
reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
213
223
|
similarity_fn_name="dot",
|
|
214
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
224
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
215
225
|
public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
|
|
216
226
|
public_training_data=True,
|
|
217
227
|
use_instructions=True,
|
|
@@ -227,13 +237,14 @@ opensearch_neural_sparse_encoding_doc_v1 = ModelMeta(
|
|
|
227
237
|
revision="98cdcbd72867c547f72f2b7b7bed9cdf9f09922d",
|
|
228
238
|
release_date="2024-03-07",
|
|
229
239
|
n_parameters=132_955_194,
|
|
240
|
+
n_embedding_parameters=23_440_896,
|
|
230
241
|
memory_usage_mb=507,
|
|
231
242
|
embed_dim=30522,
|
|
232
243
|
license="apache-2.0",
|
|
233
244
|
max_tokens=512,
|
|
234
245
|
reference="https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v1",
|
|
235
246
|
similarity_fn_name="dot",
|
|
236
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
247
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
237
248
|
public_training_code="https://github.com/zhichao-aws/opensearch-sparse-model-tuning-sample",
|
|
238
249
|
public_training_data=True,
|
|
239
250
|
use_instructions=True,
|
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
2
4
|
|
|
3
5
|
from mteb.models.abs_encoder import AbsEncoder
|
|
4
6
|
from mteb.models.model_meta import ModelMeta
|
|
5
7
|
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from mteb.types import Array
|
|
10
|
+
|
|
6
11
|
|
|
7
12
|
class OPSWrapper(AbsEncoder):
|
|
8
13
|
def __init__(self, model_name: str, revision: str):
|
|
@@ -15,7 +20,7 @@ class OPSWrapper(AbsEncoder):
|
|
|
15
20
|
)
|
|
16
21
|
self.output_dim = 1536
|
|
17
22
|
|
|
18
|
-
def encode(self, sentences: list[str], **kwargs) ->
|
|
23
|
+
def encode(self, sentences: list[str], **kwargs) -> Array:
|
|
19
24
|
embeddings = self.model.encode(sentences, **kwargs)
|
|
20
25
|
return embeddings[:, : self.output_dim]
|
|
21
26
|
|
|
@@ -28,6 +33,7 @@ ops_moa_conan_embedding = ModelMeta(
|
|
|
28
33
|
languages=["zho-Hans"],
|
|
29
34
|
loader=OPSWrapper,
|
|
30
35
|
n_parameters=int(343 * 1e6),
|
|
36
|
+
n_embedding_parameters=21_635_072,
|
|
31
37
|
memory_usage_mb=1308,
|
|
32
38
|
max_tokens=512,
|
|
33
39
|
embed_dim=1536,
|
|
@@ -60,6 +66,7 @@ ops_moa_yuan_embedding = ModelMeta(
|
|
|
60
66
|
languages=["zho-Hans"],
|
|
61
67
|
loader=OPSWrapper,
|
|
62
68
|
n_parameters=int(343 * 1e6),
|
|
69
|
+
n_embedding_parameters=21_635_072,
|
|
63
70
|
memory_usage_mb=1242,
|
|
64
71
|
max_tokens=512,
|
|
65
72
|
embed_dim=1536,
|
|
@@ -67,7 +74,7 @@ ops_moa_yuan_embedding = ModelMeta(
|
|
|
67
74
|
open_weights=True,
|
|
68
75
|
public_training_code=None,
|
|
69
76
|
public_training_data=None,
|
|
70
|
-
framework=["PyTorch", "Sentence Transformers"],
|
|
77
|
+
framework=["PyTorch", "Sentence Transformers", "safetensors"],
|
|
71
78
|
reference="https://huggingface.co/OpenSearch-AI/Ops-MoA-Yuan-embedding-1.0",
|
|
72
79
|
similarity_fn_name="cosine",
|
|
73
80
|
use_instructions=False,
|
|
@@ -4,6 +4,7 @@ solon_embeddings_1_1 = ModelMeta(
|
|
|
4
4
|
name="OrdalieTech/Solon-embeddings-mini-beta-1.1",
|
|
5
5
|
languages=["fra-Latn"],
|
|
6
6
|
n_parameters=210_000_000,
|
|
7
|
+
n_embedding_parameters=None,
|
|
7
8
|
public_training_code=None,
|
|
8
9
|
memory_usage_mb=808.0,
|
|
9
10
|
open_weights=True,
|
|
@@ -14,7 +15,7 @@ solon_embeddings_1_1 = ModelMeta(
|
|
|
14
15
|
max_tokens=8192,
|
|
15
16
|
reference="https://huggingface.co/OrdalieTech/Solon-embeddings-mini-beta-1.1",
|
|
16
17
|
similarity_fn_name="cosine",
|
|
17
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
18
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
18
19
|
use_instructions=False,
|
|
19
20
|
public_training_data=(
|
|
20
21
|
"https://huggingface.co/datasets/PleIAs/common_corpus; "
|
|
@@ -20,13 +20,14 @@ pawan_embd_68m = ModelMeta(
|
|
|
20
20
|
revision="32f295145802bdbd65699ad65fd27d2a5b69a909",
|
|
21
21
|
release_date="2025-12-08",
|
|
22
22
|
n_parameters=68_000_000,
|
|
23
|
+
n_embedding_parameters=None,
|
|
23
24
|
memory_usage_mb=260,
|
|
24
25
|
embed_dim=768,
|
|
25
26
|
license="apache-2.0",
|
|
26
27
|
max_tokens=512,
|
|
27
28
|
reference="https://huggingface.co/dmedhi/PawanEmbd-68M",
|
|
28
29
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
29
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
30
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
30
31
|
adapted_from="ibm-granite/granite-embedding-278m-multilingual",
|
|
31
32
|
superseded_by=None,
|
|
32
33
|
public_training_code=None,
|
|
@@ -12,13 +12,14 @@ piccolo_base_zh = ModelMeta(
|
|
|
12
12
|
revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
|
|
13
13
|
release_date="2023-09-04", # first commit
|
|
14
14
|
n_parameters=None,
|
|
15
|
+
n_embedding_parameters=16_226_304,
|
|
15
16
|
memory_usage_mb=None, # can't see on model card
|
|
16
17
|
embed_dim=768,
|
|
17
18
|
license="mit",
|
|
18
19
|
max_tokens=512,
|
|
19
20
|
reference="https://huggingface.co/sensenova/piccolo-base-zh",
|
|
20
21
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
21
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
22
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers"],
|
|
22
23
|
use_instructions=False,
|
|
23
24
|
superseded_by=None,
|
|
24
25
|
adapted_from=None,
|
|
@@ -37,6 +38,7 @@ piccolo_large_zh_v2 = ModelMeta(
|
|
|
37
38
|
revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
|
|
38
39
|
release_date="2024-04-22", # first commit
|
|
39
40
|
n_parameters=None,
|
|
41
|
+
n_embedding_parameters=None,
|
|
40
42
|
memory_usage_mb=None, # we don't know because they removed the model
|
|
41
43
|
embed_dim=1024,
|
|
42
44
|
license="not specified",
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from mteb.models.model_implementations.arctic_models import (
|
|
2
|
+
ARCTIC_V2_CITATION,
|
|
3
|
+
LANGUAGES_V2_0,
|
|
4
|
+
arctic_v2_training_datasets,
|
|
5
|
+
)
|
|
6
|
+
from mteb.models.model_meta import (
|
|
7
|
+
ModelMeta,
|
|
8
|
+
ScoringFunction,
|
|
9
|
+
)
|
|
10
|
+
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
11
|
+
|
|
12
|
+
PIXIE_RUNE_V1_CITATION = """@misc{TelePIX-PIXIE-Rune-v1.0,
|
|
13
|
+
title = {PIXIE-Rune-v1.0},
|
|
14
|
+
author = {TelePIX AI Research Team and Bongmin Kim},
|
|
15
|
+
year = {2026},
|
|
16
|
+
howpublished = {Hugging Face model card},
|
|
17
|
+
url = {https://huggingface.co/telepix/PIXIE-Rune-v1.0}
|
|
18
|
+
}"""
|
|
19
|
+
|
|
20
|
+
PIXIE_RUNE_V1_PROMPTS = {
|
|
21
|
+
"query": "query: ",
|
|
22
|
+
"document": "",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
# it is further fine-tuned on TelePIX proprietary IR data (not public).
|
|
26
|
+
pixie_rune_v1_training_datasets = set(arctic_v2_training_datasets) | {
|
|
27
|
+
"TelePIX-Proprietary-IR-Triplets",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
pixie_rune_v1_0 = ModelMeta(
|
|
31
|
+
loader=sentence_transformers_loader,
|
|
32
|
+
loader_kwargs={
|
|
33
|
+
"model_prompts": PIXIE_RUNE_V1_PROMPTS,
|
|
34
|
+
},
|
|
35
|
+
name="telepix/PIXIE-Rune-v1.0",
|
|
36
|
+
model_type=["dense"],
|
|
37
|
+
revision="b2486496da71191626666a88f9bfec844933a134",
|
|
38
|
+
release_date="2026-01-15",
|
|
39
|
+
languages=LANGUAGES_V2_0,
|
|
40
|
+
open_weights=True,
|
|
41
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
42
|
+
n_parameters=567754752,
|
|
43
|
+
memory_usage_mb=2166,
|
|
44
|
+
max_tokens=6144,
|
|
45
|
+
embed_dim=1024,
|
|
46
|
+
license="apache-2.0",
|
|
47
|
+
reference="https://huggingface.co/telepix/PIXIE-Rune-v1.0",
|
|
48
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
49
|
+
use_instructions=True,
|
|
50
|
+
adapted_from="Snowflake/snowflake-arctic-embed-l-v2.0",
|
|
51
|
+
superseded_by=None,
|
|
52
|
+
public_training_code=None,
|
|
53
|
+
public_training_data=None,
|
|
54
|
+
training_datasets=pixie_rune_v1_training_datasets,
|
|
55
|
+
citation=PIXIE_RUNE_V1_CITATION + "\n\n" + ARCTIC_V2_CITATION,
|
|
56
|
+
)
|
|
@@ -1,15 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import torch
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
7
|
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
18
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
13
19
|
|
|
14
20
|
from .repllama_models import RepLLaMAModel, model_prompts
|
|
15
21
|
|
|
@@ -81,6 +87,7 @@ promptriever_llama2 = ModelMeta(
|
|
|
81
87
|
revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision
|
|
82
88
|
release_date="2024-09-15",
|
|
83
89
|
n_parameters=7_000_000_000,
|
|
90
|
+
n_embedding_parameters=None,
|
|
84
91
|
memory_usage_mb=26703,
|
|
85
92
|
max_tokens=4096,
|
|
86
93
|
embed_dim=4096,
|
|
@@ -90,7 +97,7 @@ promptriever_llama2 = ModelMeta(
|
|
|
90
97
|
),
|
|
91
98
|
reference="https://huggingface.co/samaya-ai/promptriever-llama2-7b-v1",
|
|
92
99
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
93
|
-
framework=["PyTorch", "Tevatron"],
|
|
100
|
+
framework=["PyTorch", "Tevatron", "safetensors"],
|
|
94
101
|
use_instructions=True,
|
|
95
102
|
citation=PROMPTRIEVER_CITATION,
|
|
96
103
|
public_training_code=None,
|
|
@@ -117,13 +124,14 @@ promptriever_llama3 = ModelMeta(
|
|
|
117
124
|
},
|
|
118
125
|
release_date="2024-09-15",
|
|
119
126
|
n_parameters=8_000_000_000,
|
|
127
|
+
n_embedding_parameters=None,
|
|
120
128
|
memory_usage_mb=30518,
|
|
121
129
|
max_tokens=8192,
|
|
122
130
|
embed_dim=4096,
|
|
123
131
|
license="apache-2.0",
|
|
124
132
|
reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-v1",
|
|
125
133
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
126
|
-
framework=["PyTorch", "Tevatron"],
|
|
134
|
+
framework=["PyTorch", "Tevatron", "safetensors"],
|
|
127
135
|
use_instructions=True,
|
|
128
136
|
citation=PROMPTRIEVER_CITATION,
|
|
129
137
|
public_training_code=None,
|
|
@@ -146,6 +154,7 @@ promptriever_llama3_instruct = ModelMeta(
|
|
|
146
154
|
revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision
|
|
147
155
|
release_date="2024-09-15",
|
|
148
156
|
n_parameters=8_000_000_000,
|
|
157
|
+
n_embedding_parameters=None,
|
|
149
158
|
memory_usage_mb=30518,
|
|
150
159
|
max_tokens=8192,
|
|
151
160
|
embed_dim=4096,
|
|
@@ -156,7 +165,7 @@ promptriever_llama3_instruct = ModelMeta(
|
|
|
156
165
|
license="apache-2.0",
|
|
157
166
|
reference="https://huggingface.co/samaya-ai/promptriever-llama3.1-8b-instruct-v1",
|
|
158
167
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
159
|
-
framework=["PyTorch", "Tevatron"],
|
|
168
|
+
framework=["PyTorch", "Tevatron", "safetensors"],
|
|
160
169
|
use_instructions=True,
|
|
161
170
|
citation=PROMPTRIEVER_CITATION,
|
|
162
171
|
public_training_code=None,
|
|
@@ -179,6 +188,7 @@ promptriever_mistral_v1 = ModelMeta(
|
|
|
179
188
|
revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision
|
|
180
189
|
release_date="2024-09-15",
|
|
181
190
|
n_parameters=7_000_000_000,
|
|
191
|
+
n_embedding_parameters=131_072_000,
|
|
182
192
|
memory_usage_mb=26703,
|
|
183
193
|
training_datasets={
|
|
184
194
|
# "samaya-ai/msmarco-w-instructions",
|
|
@@ -189,7 +199,7 @@ promptriever_mistral_v1 = ModelMeta(
|
|
|
189
199
|
license="apache-2.0",
|
|
190
200
|
reference="https://huggingface.co/samaya-ai/promptriever-mistral-v0.1-7b-v1",
|
|
191
201
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
192
|
-
framework=["PyTorch", "Tevatron"],
|
|
202
|
+
framework=["PyTorch", "Tevatron", "safetensors"],
|
|
193
203
|
use_instructions=True,
|
|
194
204
|
citation=PROMPTRIEVER_CITATION,
|
|
195
205
|
public_training_code=None,
|
|
@@ -1,35 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import heapq
|
|
2
4
|
import logging
|
|
3
5
|
import shutil
|
|
4
6
|
import tempfile
|
|
5
7
|
from pathlib import Path
|
|
6
|
-
from typing import Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
7
9
|
|
|
8
10
|
import torch
|
|
9
|
-
from torch.utils.data import DataLoader
|
|
10
11
|
|
|
11
12
|
from mteb._create_dataloaders import (
|
|
12
13
|
create_dataloader,
|
|
13
14
|
)
|
|
14
15
|
from mteb._requires_package import requires_package
|
|
15
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
16
|
from mteb.models.abs_encoder import AbsEncoder
|
|
17
17
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
18
|
-
from mteb.types import
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
18
|
+
from mteb.types import PromptType
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from torch.utils.data import DataLoader
|
|
22
|
+
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
24
|
+
from mteb.types import (
|
|
25
|
+
Array,
|
|
26
|
+
BatchedInput,
|
|
27
|
+
CorpusDatasetType,
|
|
28
|
+
EncodeKwargs,
|
|
29
|
+
QueryDatasetType,
|
|
30
|
+
RetrievalOutputType,
|
|
31
|
+
TopRankedDocumentsType,
|
|
32
|
+
)
|
|
33
|
+
|
|
27
34
|
|
|
28
35
|
logger = logging.getLogger(__name__)
|
|
29
36
|
|
|
30
37
|
|
|
31
38
|
class PylateSearchEncoder:
|
|
32
|
-
"""Mixin class to add PyLate-based indexing and search to an encoder. Implements
|
|
39
|
+
"""Mixin class to add PyLate-based indexing and search to an encoder. Implements [SearchProtocol][mteb.models.SearchProtocol]"""
|
|
33
40
|
|
|
34
41
|
base_index_dir: Path | None = None
|
|
35
42
|
_index_dir: Path | None = None
|
|
@@ -45,7 +52,8 @@ class PylateSearchEncoder:
|
|
|
45
52
|
task_metadata: TaskMetadata,
|
|
46
53
|
hf_split: str,
|
|
47
54
|
hf_subset: str,
|
|
48
|
-
encode_kwargs:
|
|
55
|
+
encode_kwargs: EncodeKwargs,
|
|
56
|
+
num_proc: int,
|
|
49
57
|
) -> None:
|
|
50
58
|
"""Index the corpus for retrieval.
|
|
51
59
|
|
|
@@ -55,6 +63,7 @@ class PylateSearchEncoder:
|
|
|
55
63
|
hf_split: Split of current task, allows to know some additional information about current split.
|
|
56
64
|
hf_subset: Subset of current task. Similar to `hf_split` to get more information
|
|
57
65
|
encode_kwargs: Additional arguments to pass to the encoder during indexing.
|
|
66
|
+
num_proc: Number of processes to use for indexing.
|
|
58
67
|
"""
|
|
59
68
|
self.task_corpus = corpus
|
|
60
69
|
|
|
@@ -78,14 +87,16 @@ class PylateSearchEncoder:
|
|
|
78
87
|
hf_split: str,
|
|
79
88
|
hf_subset: str,
|
|
80
89
|
top_k: int,
|
|
81
|
-
encode_kwargs:
|
|
90
|
+
encode_kwargs: EncodeKwargs,
|
|
82
91
|
top_ranked: TopRankedDocumentsType | None = None,
|
|
92
|
+
num_proc: int,
|
|
83
93
|
) -> RetrievalOutputType:
|
|
84
94
|
queries_dataloader = create_dataloader(
|
|
85
95
|
queries,
|
|
86
96
|
task_metadata,
|
|
87
97
|
prompt_type=PromptType.query,
|
|
88
98
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
99
|
+
num_proc=num_proc,
|
|
89
100
|
)
|
|
90
101
|
|
|
91
102
|
query_embeddings = self.encode(
|
|
@@ -109,6 +120,7 @@ class PylateSearchEncoder:
|
|
|
109
120
|
hf_subset=hf_subset,
|
|
110
121
|
hf_split=hf_split,
|
|
111
122
|
encode_kwargs=encode_kwargs,
|
|
123
|
+
num_proc=num_proc,
|
|
112
124
|
)
|
|
113
125
|
else:
|
|
114
126
|
result_heaps = self._pylate_full_corpus_search(
|
|
@@ -119,6 +131,7 @@ class PylateSearchEncoder:
|
|
|
119
131
|
hf_subset=hf_subset,
|
|
120
132
|
hf_split=hf_split,
|
|
121
133
|
encode_kwargs=encode_kwargs,
|
|
134
|
+
num_proc=num_proc,
|
|
122
135
|
)
|
|
123
136
|
|
|
124
137
|
results = {qid: {} for qid in query_idx_to_id.values()}
|
|
@@ -136,7 +149,8 @@ class PylateSearchEncoder:
|
|
|
136
149
|
hf_subset: str,
|
|
137
150
|
hf_split: str,
|
|
138
151
|
top_k: int,
|
|
139
|
-
encode_kwargs:
|
|
152
|
+
encode_kwargs: EncodeKwargs,
|
|
153
|
+
num_proc: int,
|
|
140
154
|
) -> dict[str, list[tuple[float, str]]]:
|
|
141
155
|
from pylate import indexes, retrieve
|
|
142
156
|
|
|
@@ -163,6 +177,7 @@ class PylateSearchEncoder:
|
|
|
163
177
|
task_metadata,
|
|
164
178
|
prompt_type=PromptType.document,
|
|
165
179
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
180
|
+
num_proc=num_proc,
|
|
166
181
|
)
|
|
167
182
|
documents_embeddings = self.encode(
|
|
168
183
|
documents_loader,
|
|
@@ -200,7 +215,8 @@ class PylateSearchEncoder:
|
|
|
200
215
|
task_metadata: TaskMetadata,
|
|
201
216
|
hf_subset: str,
|
|
202
217
|
hf_split: str,
|
|
203
|
-
encode_kwargs:
|
|
218
|
+
encode_kwargs: EncodeKwargs,
|
|
219
|
+
num_proc: int = 1,
|
|
204
220
|
) -> dict[str, list[tuple[float, str]]]:
|
|
205
221
|
"""Rerank with PyLate's rank.rerank using per-query candidates.
|
|
206
222
|
|
|
@@ -223,6 +239,7 @@ class PylateSearchEncoder:
|
|
|
223
239
|
task_metadata,
|
|
224
240
|
prompt_type=PromptType.document,
|
|
225
241
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
242
|
+
num_proc=num_proc,
|
|
226
243
|
),
|
|
227
244
|
task_metadata=task_metadata,
|
|
228
245
|
hf_split=hf_split,
|
|
@@ -345,12 +362,13 @@ colbert_v2 = ModelMeta(
|
|
|
345
362
|
public_training_data=None,
|
|
346
363
|
release_date="2024-09-21",
|
|
347
364
|
n_parameters=int(110 * 1e6),
|
|
365
|
+
n_embedding_parameters=23_440_896,
|
|
348
366
|
memory_usage_mb=418,
|
|
349
367
|
max_tokens=180,
|
|
350
368
|
embed_dim=None,
|
|
351
369
|
license="mit",
|
|
352
370
|
similarity_fn_name=ScoringFunction.MAX_SIM,
|
|
353
|
-
framework=["PyLate", "ColBERT"],
|
|
371
|
+
framework=["PyLate", "ColBERT", "Transformers", "ONNX", "safetensors"],
|
|
354
372
|
reference="https://huggingface.co/colbert-ir/colbertv2.0",
|
|
355
373
|
use_instructions=False,
|
|
356
374
|
adapted_from=None,
|
|
@@ -401,12 +419,13 @@ jina_colbert_v2 = ModelMeta(
|
|
|
401
419
|
public_training_data=None,
|
|
402
420
|
release_date="2024-08-16",
|
|
403
421
|
n_parameters=int(559 * 1e6),
|
|
422
|
+
n_embedding_parameters=None,
|
|
404
423
|
memory_usage_mb=1067,
|
|
405
424
|
max_tokens=8192,
|
|
406
425
|
embed_dim=None,
|
|
407
426
|
license="cc-by-nc-4.0",
|
|
408
427
|
similarity_fn_name=ScoringFunction.MAX_SIM,
|
|
409
|
-
framework=["PyLate", "ColBERT"],
|
|
428
|
+
framework=["PyLate", "ColBERT", "ONNX", "safetensors"],
|
|
410
429
|
reference="https://huggingface.co/jinaai/jina-colbert-v2",
|
|
411
430
|
use_instructions=False,
|
|
412
431
|
adapted_from=None,
|
|
@@ -439,7 +458,7 @@ jina_colbert_v2 = ModelMeta(
|
|
|
439
458
|
url = "https://aclanthology.org/2024.mrl-1.11/",
|
|
440
459
|
doi = "10.18653/v1/2024.mrl-1.11",
|
|
441
460
|
pages = "159--166",
|
|
442
|
-
abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{
|
|
461
|
+
abstract = "Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT`s late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context window and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that the reducing the embedding dimensionality from 128 to 64 has insignificant impact on the model`s retrieval performance and cut storage requirements by up to 50{\\%}. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks,"
|
|
443
462
|
}""",
|
|
444
463
|
)
|
|
445
464
|
|
|
@@ -457,12 +476,13 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
|
|
|
457
476
|
public_training_data="https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma",
|
|
458
477
|
release_date="2025-04-30",
|
|
459
478
|
n_parameters=int(149 * 1e6),
|
|
479
|
+
n_embedding_parameters=None,
|
|
460
480
|
memory_usage_mb=None,
|
|
461
481
|
max_tokens=8192,
|
|
462
482
|
embed_dim=None,
|
|
463
483
|
license="apache-2.0",
|
|
464
484
|
similarity_fn_name="MaxSim",
|
|
465
|
-
framework=["PyLate", "ColBERT"],
|
|
485
|
+
framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
|
|
466
486
|
reference="https://huggingface.co/lightonai/GTE-ModernColBERT-v1",
|
|
467
487
|
use_instructions=False,
|
|
468
488
|
adapted_from="Alibaba-NLP/gte-modernbert-base",
|
|
@@ -36,13 +36,14 @@ Qodo_Embed_1_1_5B = ModelMeta(
|
|
|
36
36
|
revision="84bbef079b32e8823ec226d4e9e92902706b0eb6",
|
|
37
37
|
release_date="2025-02-19",
|
|
38
38
|
n_parameters=1_780_000_000,
|
|
39
|
+
n_embedding_parameters=232_928_256,
|
|
39
40
|
memory_usage_mb=6776,
|
|
40
41
|
embed_dim=1536,
|
|
41
42
|
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
|
|
42
43
|
max_tokens=32768,
|
|
43
44
|
reference="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B",
|
|
44
45
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
45
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
46
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
46
47
|
use_instructions=False,
|
|
47
48
|
public_training_code=None,
|
|
48
49
|
public_training_data=None,
|
|
@@ -59,13 +60,14 @@ Qodo_Embed_1_7B = ModelMeta(
|
|
|
59
60
|
revision="f9edd9bf7f687c0e832424058e265120f603cd81",
|
|
60
61
|
release_date="2025-02-24",
|
|
61
62
|
n_parameters=7_613_000_000,
|
|
63
|
+
n_embedding_parameters=None,
|
|
62
64
|
memory_usage_mb=29040,
|
|
63
65
|
embed_dim=3584,
|
|
64
66
|
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
|
|
65
67
|
max_tokens=32768,
|
|
66
68
|
reference="https://huggingface.co/Qodo/Qodo-Embed-1-7B",
|
|
67
69
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
68
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
70
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
69
71
|
use_instructions=False,
|
|
70
72
|
public_training_code=None,
|
|
71
73
|
public_training_data=None,
|
|
@@ -31,13 +31,14 @@ mini_gte = ModelMeta(
|
|
|
31
31
|
revision="7fbe6f9b4cc42615e0747299f837ad7769025492",
|
|
32
32
|
release_date="2025-01-28",
|
|
33
33
|
n_parameters=int(66.3 * 1e6),
|
|
34
|
+
n_embedding_parameters=23_440_896,
|
|
34
35
|
memory_usage_mb=253,
|
|
35
36
|
embed_dim=768,
|
|
36
37
|
license="apache-2.0",
|
|
37
38
|
max_tokens=512,
|
|
38
39
|
reference="https://huggingface.co/prdev/mini-gte",
|
|
39
40
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
40
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
41
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
41
42
|
use_instructions=False,
|
|
42
43
|
public_training_code=None,
|
|
43
44
|
public_training_data=None,
|
|
@@ -1,6 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
6
|
from mteb.models.model_meta import ModelMeta
|
|
3
|
-
from mteb.
|
|
7
|
+
from mteb.types import PromptType
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
4
11
|
|
|
5
12
|
|
|
6
13
|
def instruction_template(
|
|
@@ -140,13 +147,14 @@ Qwen3_Embedding_0B6 = ModelMeta(
|
|
|
140
147
|
revision="b22da495047858cce924d27d76261e96be6febc0", # Commit of @tomaarsen
|
|
141
148
|
release_date="2025-06-05",
|
|
142
149
|
n_parameters=595776512,
|
|
150
|
+
n_embedding_parameters=None,
|
|
143
151
|
memory_usage_mb=1136,
|
|
144
152
|
embed_dim=1024,
|
|
145
153
|
max_tokens=32768,
|
|
146
154
|
license="apache-2.0",
|
|
147
155
|
reference="https://huggingface.co/Qwen/Qwen3-Embedding-0.6B",
|
|
148
156
|
similarity_fn_name="cosine",
|
|
149
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
157
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
150
158
|
use_instructions=True,
|
|
151
159
|
public_training_code=None,
|
|
152
160
|
public_training_data=None,
|
|
@@ -163,13 +171,14 @@ Qwen3_Embedding_4B = ModelMeta(
|
|
|
163
171
|
revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff", # Commit of @tomaarsen
|
|
164
172
|
release_date="2025-06-05",
|
|
165
173
|
n_parameters=4021774336,
|
|
174
|
+
n_embedding_parameters=None,
|
|
166
175
|
memory_usage_mb=7671,
|
|
167
176
|
embed_dim=2560,
|
|
168
177
|
max_tokens=32768,
|
|
169
178
|
license="apache-2.0",
|
|
170
179
|
reference="https://huggingface.co/Qwen/Qwen3-Embedding-4B",
|
|
171
180
|
similarity_fn_name="cosine",
|
|
172
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
181
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
173
182
|
use_instructions=True,
|
|
174
183
|
public_training_code=None,
|
|
175
184
|
public_training_data=None,
|
|
@@ -186,13 +195,14 @@ Qwen3_Embedding_8B = ModelMeta(
|
|
|
186
195
|
revision="4e423935c619ae4df87b646a3ce949610c66241c", # Commit of @tomaarsen
|
|
187
196
|
release_date="2025-06-05",
|
|
188
197
|
n_parameters=7567295488,
|
|
198
|
+
n_embedding_parameters=None,
|
|
189
199
|
memory_usage_mb=14433,
|
|
190
200
|
embed_dim=4096,
|
|
191
201
|
max_tokens=32768,
|
|
192
202
|
license="apache-2.0",
|
|
193
203
|
reference="https://huggingface.co/Qwen/Qwen3-Embedding-8B",
|
|
194
204
|
similarity_fn_name="cosine",
|
|
195
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
205
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
196
206
|
use_instructions=True,
|
|
197
207
|
public_training_code=None,
|
|
198
208
|
public_training_data=None,
|