mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +78 -30
- mteb/_evaluators/any_sts_evaluator.py +13 -6
- mteb/_evaluators/clustering_evaluator.py +13 -5
- mteb/_evaluators/evaluator.py +12 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
- mteb/_evaluators/pair_classification_evaluator.py +17 -7
- mteb/_evaluators/retrieval_evaluator.py +23 -14
- mteb/_evaluators/retrieval_metrics.py +26 -19
- mteb/_evaluators/sklearn_evaluator.py +27 -17
- mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
- mteb/_evaluators/text/summarization_evaluator.py +31 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +9 -3
- mteb/abstasks/_data_filter/task_pipelines.py +10 -2
- mteb/abstasks/_statistics_calculation.py +21 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +78 -44
- mteb/abstasks/aggregate_task_metadata.py +21 -18
- mteb/abstasks/aggregated_task.py +23 -35
- mteb/abstasks/classification.py +39 -18
- mteb/abstasks/clustering.py +37 -20
- mteb/abstasks/clustering_legacy.py +30 -16
- mteb/abstasks/image/image_text_pair_classification.py +26 -9
- mteb/abstasks/multilabel_classification.py +33 -21
- mteb/abstasks/pair_classification.py +44 -19
- mteb/abstasks/regression.py +18 -10
- mteb/abstasks/retrieval.py +82 -52
- mteb/abstasks/retrieval_dataset_loaders.py +50 -39
- mteb/abstasks/sts.py +34 -15
- mteb/abstasks/task_metadata.py +44 -37
- mteb/abstasks/text/bitext_mining.py +57 -35
- mteb/abstasks/text/reranking.py +10 -8
- mteb/abstasks/text/summarization.py +26 -10
- mteb/abstasks/zeroshot_classification.py +27 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +15 -3
- mteb/benchmarks/benchmarks/__init__.py +6 -0
- mteb/benchmarks/benchmarks/benchmarks.py +153 -13
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +189 -31
- mteb/cli/_display_tasks.py +10 -4
- mteb/cli/build_cli.py +112 -13
- mteb/cli/generate_model_card.py +50 -23
- mteb/deprecated_evaluator.py +72 -54
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +71 -47
- mteb/filter_tasks.py +36 -32
- mteb/get_tasks.py +37 -33
- mteb/languages/language_scripts.py +11 -4
- mteb/leaderboard/app.py +172 -37
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +20 -14
- mteb/models/abs_encoder.py +30 -16
- mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +16 -11
- mteb/models/get_model_meta.py +53 -9
- mteb/models/instruct_wrapper.py +41 -13
- mteb/models/model_implementations/align_models.py +11 -5
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +6 -4
- mteb/models/model_implementations/ara_models.py +2 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +85 -22
- mteb/models/model_implementations/bica_model.py +4 -3
- mteb/models/model_implementations/blip2_models.py +13 -6
- mteb/models/model_implementations/blip_models.py +33 -20
- mteb/models/model_implementations/bm25.py +27 -17
- mteb/models/model_implementations/bmretriever_models.py +16 -6
- mteb/models/model_implementations/cadet_models.py +2 -1
- mteb/models/model_implementations/cde_models.py +22 -9
- mteb/models/model_implementations/clip_models.py +18 -10
- mteb/models/model_implementations/clips_models.py +6 -3
- mteb/models/model_implementations/codefuse_models.py +10 -5
- mteb/models/model_implementations/codesage_models.py +6 -3
- mteb/models/model_implementations/cohere_models.py +19 -9
- mteb/models/model_implementations/cohere_v.py +16 -6
- mteb/models/model_implementations/colpali_models.py +10 -6
- mteb/models/model_implementations/colqwen_models.py +24 -38
- mteb/models/model_implementations/colsmol_models.py +5 -3
- mteb/models/model_implementations/conan_models.py +12 -5
- mteb/models/model_implementations/dino_models.py +70 -46
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +18 -9
- mteb/models/model_implementations/e5_v.py +16 -10
- mteb/models/model_implementations/eagerworks_models.py +12 -5
- mteb/models/model_implementations/emillykkejensen_models.py +9 -6
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +3 -2
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +18 -9
- mteb/models/model_implementations/facebookai.py +16 -2
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +13 -8
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
- mteb/models/model_implementations/gritlm_models.py +5 -2
- mteb/models/model_implementations/gte_models.py +34 -13
- mteb/models/model_implementations/hinvec_models.py +7 -2
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +16 -7
- mteb/models/model_implementations/jina_clip.py +58 -14
- mteb/models/model_implementations/jina_models.py +35 -16
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +13 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
- mteb/models/model_implementations/kfst.py +2 -1
- mteb/models/model_implementations/kowshik24_models.py +2 -1
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +8 -2
- mteb/models/model_implementations/listconranker.py +11 -5
- mteb/models/model_implementations/llm2clip_models.py +18 -10
- mteb/models/model_implementations/llm2vec_models.py +28 -14
- mteb/models/model_implementations/mcinext_models.py +12 -3
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +131 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
- mteb/models/model_implementations/mme5_models.py +3 -2
- mteb/models/model_implementations/moco_models.py +15 -8
- mteb/models/model_implementations/mod_models.py +3 -2
- mteb/models/model_implementations/model2vec_models.py +37 -18
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +6 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
- mteb/models/model_implementations/nomic_models.py +47 -19
- mteb/models/model_implementations/nomic_models_vision.py +6 -4
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
- mteb/models/model_implementations/nvidia_models.py +165 -22
- mteb/models/model_implementations/octen_models.py +64 -3
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +30 -17
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
- mteb/models/model_implementations/ops_moa_models.py +10 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
- mteb/models/model_implementations/pawan_models.py +2 -1
- mteb/models/model_implementations/piccolo_models.py +3 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +20 -10
- mteb/models/model_implementations/pylate_models.py +41 -21
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +14 -4
- mteb/models/model_implementations/qzhou_models.py +4 -2
- mteb/models/model_implementations/random_baseline.py +7 -6
- mteb/models/model_implementations/rasgaard_models.py +3 -2
- mteb/models/model_implementations/reasonir_model.py +66 -1
- mteb/models/model_implementations/repllama_models.py +18 -9
- mteb/models/model_implementations/rerankers_custom.py +25 -10
- mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +40 -20
- mteb/models/model_implementations/ruri_models.py +20 -10
- mteb/models/model_implementations/salesforce_models.py +13 -4
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +142 -22
- mteb/models/model_implementations/shuu_model.py +2 -1
- mteb/models/model_implementations/siglip_models.py +39 -24
- mteb/models/model_implementations/slm_models.py +419 -0
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +4 -2
- mteb/models/model_implementations/text2vec_models.py +12 -3
- mteb/models/model_implementations/ua_sentence_models.py +2 -1
- mteb/models/model_implementations/uae_models.py +17 -5
- mteb/models/model_implementations/vdr_models.py +9 -2
- mteb/models/model_implementations/vi_vn_models.py +12 -6
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +14 -7
- mteb/models/model_implementations/voyage_models.py +136 -4
- mteb/models/model_implementations/voyage_v.py +17 -10
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +2 -1
- mteb/models/model_implementations/yuan_models_en.py +3 -2
- mteb/models/model_meta.py +127 -40
- mteb/models/models_protocols.py +43 -22
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
- mteb/models/search_wrappers.py +63 -29
- mteb/models/sentence_transformer_wrapper.py +52 -26
- mteb/models/vllm_wrapper.py +329 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +48 -35
- mteb/results/model_result.py +68 -32
- mteb/results/task_result.py +110 -72
- mteb/similarity_functions.py +19 -9
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +2 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +16 -16
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +44 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +3 -3
- mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +13 -1
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +18 -5
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0

mteb/models/model_implementations/jina_clip.py

@@ -1,14 +1,20 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_image_dependencies
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 JINA_CLIP_CITATION = """@article{koukounas2024jinaclip,
     title={Jina CLIP: Your CLIP Model Is Also Your Text Retriever},
@@ -120,8 +126,17 @@ class JinaCLIPModel(AbsEncoder):
         raise ValueError
 
 
+_JINA_CLIP_TRAIN_DATASETS_V1 = {
+    # LAION400M
+    # ShareGPT4V
+    "MSMARCO",
+    "NQ",
+    "HotpotQA",
+    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+}
+
 jina_clip_v1 = ModelMeta(
-    loader=JinaCLIPModel,
+    loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",
     model_type=["dense"],
     languages=["eng-Latn"],
@@ -129,6 +144,7 @@ jina_clip_v1 = ModelMeta(
     release_date="2024-05-30",
     modalities=["image", "text"],
     n_parameters=223_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=849,
     max_tokens=8192,
     embed_dim=768,
@@ -136,17 +152,45 @@ jina_clip_v1 = ModelMeta(
     open_weights=True,
     public_training_code=None,
     public_training_data=None,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers", "ONNX", "safetensors"],
     reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
-    training_datasets={
-        # LAION400M
-        # ShareGPT4V
-        "MSMARCO",
-        # NQ
-        # HotpotQA
-        # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
-    },
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
     citation=JINA_CLIP_CITATION,
+    superseded_by="jinaai/jina-clip-v2",
+)
+
+jina_clip_v2 = ModelMeta(
+    loader=JinaCLIPModel,
+    name="jinaai/jina-clip-v2",
+    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
+    release_date="2024-10-09",
+    languages=["eng-Latn"],
+    n_parameters=865278477,
+    memory_usage_mb=1650.0,
+    max_tokens=8192,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/jinaai/jina-clip-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
+    modalities=["text", "image"],
+    model_type=["dense"],
+    citation="""
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+    title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+    author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+    year={2024},
+    eprint={2412.08802},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2412.08802},
+}
+""",
 )
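
Note: the import churn in the hunk above, repeated in most of the model modules that follow, moves annotation-only imports (DataLoader, TaskMetadata, the mteb.types aliases) behind a `typing.TYPE_CHECKING` guard so they are no longer executed at runtime. A minimal sketch of the idiom, with an illustrative function that is not taken from mteb:

    from __future__ import annotations  # annotations stay lazy strings

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by type checkers; never executed at runtime, so the heavy
        # torch import is deferred for callers that only need module metadata.
        from torch.utils.data import DataLoader


    def count_batches(loader: DataLoader) -> int:
        # The annotation may reference DataLoader even though it was never imported at runtime.
        return sum(1 for _ in loader)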
mteb/models/model_implementations/jina_models.py

@@ -1,14 +1,13 @@
+from __future__ import annotations
+
 import logging
 from collections import defaultdict
-from typing import Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import numpy as np
 import torch
-from sentence_transformers import CrossEncoder
-from torch.utils.data import DataLoader
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.languages import PROGRAMMING_LANGS
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -16,7 +15,13 @@ from mteb.models.sentence_transformer_wrapper import (
     CrossEncoderWrapper,
     SentenceTransformerEncoderWrapper,
 )
-
+
+if TYPE_CHECKING:
+    from sentence_transformers import CrossEncoder
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -257,6 +262,7 @@ class JinaRerankerV3Wrapper(CrossEncoderWrapper):
         self,
         model: CrossEncoder | str,
         revision: str | None = None,
+        device: str | None = None,
         trust_remote_code: bool = True,
         **kwargs: Any,
     ) -> None:
@@ -267,10 +273,7 @@
             model, trust_remote_code=trust_remote_code, dtype="auto"
         )
 
-        device =
-        if device is None:
-            device = get_device_name()
-        logger.info(f"Use pytorch device: {device}")
+        device = device or get_device_name()
 
         self.model.to(device)
         self.model.eval()
@@ -320,6 +323,7 @@ class JinaWrapper(SentenceTransformerEncoderWrapper):
         self,
         model: str,
         revision: str,
+        device: str | None = None,
         model_prompts: dict[str, str] | None = None,
         **kwargs,
     ) -> None:
@@ -339,7 +343,9 @@
         )
         import flash_attn  # noqa: F401
 
-        super().__init__(
+        super().__init__(
+            model, revision, device=device, model_prompts=model_prompts, **kwargs
+        )
 
     def encode(
         self,
@@ -727,12 +733,13 @@ jina_reranker_v3 = ModelMeta(
     release_date="2025-09-18",  # official release date
     modalities=["text"],
     n_parameters=int(0.6 * 1e9),
+    n_embedding_parameters=None,
     memory_usage_mb=1138,
     max_tokens=131072,
     embed_dim=None,
     license="cc-by-nc-4.0",
     similarity_fn_name=None,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers", "safetensors"],
     use_instructions=None,
     reference="https://huggingface.co/jinaai/jina-reranker-v3",
     public_training_code=None,
@@ -770,12 +777,13 @@ jina_embeddings_v4 = ModelMeta(
     release_date="2025-06-24",  # official release date
     modalities=["image", "text"],
     n_parameters=int(3.8 * 1e9),
+    n_embedding_parameters=None,
     memory_usage_mb=7500,
     max_tokens=32768,
     embed_dim=2048,
     license="cc-by-nc-4.0",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=True,
     reference="https://huggingface.co/jinaai/jina-embeddings-v4",
     public_training_code=None,
@@ -795,7 +803,7 @@
 
 
 jina_embeddings_v3 = ModelMeta(
-    loader=JinaWrapper,
+    loader=JinaWrapper,
     loader_kwargs=dict(
         trust_remote_code=True,
         model_prompts={
@@ -818,12 +826,19 @@ jina_embeddings_v3 = ModelMeta(
     revision="215a6e121fa0183376388ac6b1ae230326bfeaed",
     release_date="2024-09-18",  # official release date
     n_parameters=int(572 * 1e6),
+    n_embedding_parameters=None,
     memory_usage_mb=1092,
     max_tokens=8194,
     embed_dim=1024,
     license="cc-by-nc-4.0",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=[
+    framework=[
+        "Sentence Transformers",
+        "PyTorch",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
     use_instructions=True,
     reference="https://huggingface.co/jinaai/jina-embeddings-v3",
     public_training_code=None,
@@ -872,13 +887,14 @@ jina_embeddings_v2_base_en = ModelMeta(
     revision="6e85f575bc273f1fd840a658067d0157933c83f0",
     release_date="2023-09-27",
     n_parameters=137_000_000,
+    n_embedding_parameters=23_445_504,
     memory_usage_mb=262,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=8192,
     reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
     use_instructions=False,
     superseded_by=None,
     adapted_from="jina-bert-base-en-v1",  # pretrained on C4 with Alibi to support longer context.
@@ -936,13 +952,14 @@ jina_embeddings_v2_small_en = ModelMeta(
     revision="44e7d1d6caec8c883c2d4b207588504d519788d0",
     release_date="2023-09-27",
     n_parameters=32_700_000,
+    n_embedding_parameters=15_630_336,
     memory_usage_mb=62,
     embed_dim=512,
     license="apache-2.0",
     max_tokens=8192,
     reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
     use_instructions=False,
     superseded_by=None,
     adapted_from="jina-bert-smalll-en-v1",  # pretrained on C4 with Alibi to support longer context
@@ -997,6 +1014,7 @@ jina_embedding_b_en_v1 = ModelMeta(
     revision="32aa658e5ceb90793454d22a57d8e3a14e699516",
     release_date="2023-07-07",
     n_parameters=110_000_000,
+    n_embedding_parameters=24_674_304,
     memory_usage_mb=420,
     embed_dim=768,
     license="apache-2.0",
@@ -1054,6 +1072,7 @@ jina_embedding_s_en_v1 = ModelMeta(
     revision="5ac6cd473e2324c6d5f9e558a6a9f65abb57143e",
     release_date="2023-07-07",
     n_parameters=35_000_000,
+    n_embedding_parameters=16_449_536,
     memory_usage_mb=134,
     embed_dim=512,
     license="apache-2.0",
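
The new `device` keyword threaded through `JinaRerankerV3Wrapper` and `JinaWrapper` above replaces the old in-constructor lookup with a one-line fallback, `device = device or get_device_name()`. A hedged sketch of that pattern (the wrapper class is illustrative, and this stand-in for mteb's `get_device_name` helper is an assumption, not its actual implementation):

    from __future__ import annotations

    import torch


    def get_device_name() -> str:
        # Assumed behaviour: prefer CUDA, then Apple MPS, else CPU.
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"


    class RerankerSketch:
        def __init__(self, model: torch.nn.Module, device: str | None = None) -> None:
            # A caller-supplied device wins; otherwise fall back to auto-detection.
            device = device or get_device_name()
            self.model = model.to(device)
            self.model.eval()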
mteb/models/model_implementations/kalm_models.py

@@ -1,14 +1,20 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
-from mteb.types import
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -774,6 +780,7 @@ HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1 = ModelMeta(
     release_date="2024-10-23",
     languages=["eng-Latn", "zho-Hans"],
     n_parameters=494032768,
+    n_embedding_parameters=136_134_656,
     memory_usage_mb=1885,
     max_tokens=512,
     embed_dim=896,
@@ -799,6 +806,7 @@ HIT_TMG__KaLM_embedding_multilingual_mini_v1 = ModelMeta(
     release_date="2024-08-27",
     languages=["eng-Latn", "zho-Hans"],
     n_parameters=494032768,
+    n_embedding_parameters=136_134_656,
     memory_usage_mb=1885,
     max_tokens=512,
     embed_dim=896,
@@ -830,6 +838,7 @@ HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v1_5 = ModelMeta(
     release_date="2024-12-26",
     languages=["eng-Latn", "zho-Hans"],
     n_parameters=494032768,
+    n_embedding_parameters=136_134_656,
     memory_usage_mb=1885,
     max_tokens=512,
     embed_dim=896,
@@ -861,6 +870,7 @@ HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v2 = ModelMeta(
     release_date="2025-06-25",
     languages=["eng-Latn", "zho-Hans"],
     n_parameters=494032768,
+    n_embedding_parameters=136_134_656,
     memory_usage_mb=942,
     max_tokens=512,
     embed_dim=896,
@@ -892,6 +902,7 @@ KaLM_Embedding_KaLM_embedding_multilingual_mini_instruct_v2_5 = ModelMeta(
     release_date="2025-09-30",
     languages=["eng-Latn", "zho-Hans"],
     n_parameters=494032768,
+    n_embedding_parameters=136_134_656,
     memory_usage_mb=1885,
     max_tokens=512,
     embed_dim=896,
@@ -907,23 +918,23 @@
     adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2",
     superseded_by=None,
     citation="""@misc{zhao2025kalmembeddingv2,
-    title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
+    title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
     author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
     year={2025},
     eprint={2506.20923},
     archivePrefix={arXiv},
     primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2506.20923},
+    url={https://arxiv.org/abs/2506.20923},
 }
 
 @misc{hu2025kalmembedding,
-    title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
+    title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
     author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
     year={2025},
     eprint={2501.01028},
     archivePrefix={arXiv},
     primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2501.01028},
+    url={https://arxiv.org/abs/2501.01028},
 }""",
 )
 
@@ -942,6 +953,7 @@ KaLM_Embedding_gemma_3_12b_2511 = ModelMeta(
     open_weights=True,
     release_date="2025-11-06",
     n_parameters=11.76 * 1e9,
+    n_embedding_parameters=None,
     memory_usage_mb=44884,
     max_tokens=32768,
     embed_dim=3840,
@@ -954,22 +966,22 @@
     public_training_data=None,
     training_datasets=KaLM_Embedding_gemma_3_12b_training_data,
     citation="""@misc{zhao2025kalmembeddingv2,
-    title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
+    title={KaLM-Embedding-V2: Superior Training Techniques and Data Inspire A Versatile Embedding Model},
     author={Xinping Zhao and Xinshuo Hu and Zifei Shan and Shouzheng Huang and Yao Zhou and Xin Zhang and Zetian Sun and Zhenyu Liu and Dongfang Li and Xinyuan Wei and Youcheng Pan and Yang Xiang and Meishan Zhang and Haofen Wang and Jun Yu and Baotian Hu and Min Zhang},
     year={2025},
     eprint={2506.20923},
     archivePrefix={arXiv},
     primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2506.20923},
+    url={https://arxiv.org/abs/2506.20923},
 }
 
 @misc{hu2025kalmembedding,
-    title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
+    title={KaLM-Embedding: Superior Training Data Brings A Stronger Embedding Model},
     author={Xinshuo Hu and Zifei Shan and Xinping Zhao and Zetian Sun and Zhenyu Liu and Dongfang Li and Shaolin Ye and Xinyuan Wei and Qian Chen and Baotian Hu and Haofen Wang and Jun Yu and Min Zhang},
     year={2025},
     eprint={2501.01028},
     archivePrefix={arXiv},
     primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2501.01028},
+    url={https://arxiv.org/abs/2501.01028},
 }""",
 )
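
Worked check on the recurring `n_embedding_parameters` field: the values added in these hunks equal vocab_size × hidden_size of the backbone's token-embedding matrix. The 136_134_656 used for the KaLM minis above matches a 151,936-token vocabulary at hidden size 896 (a Qwen2-0.5B-sized embedding table; the backbone attributions are my inference, the arithmetic is not):

    # Sanity check: n_embedding_parameters == vocab_size * hidden_size
    # (backbone identifications in the comments are assumptions).
    def embedding_parameters(vocab_size: int, hidden_size: int) -> int:
        return vocab_size * hidden_size


    assert embedding_parameters(151_936, 896) == 136_134_656  # KaLM minis, hunks above
    assert embedding_parameters(250_002, 768) == 192_001_536  # XLM-R base (kfst.py, kowshik24_models.py below)
    assert embedding_parameters(30_528, 768) == 23_445_504    # jina-embeddings-v2-base-en above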
mteb/models/model_implementations/kblab.py

@@ -10,22 +10,29 @@ sbert_swedish = ModelMeta(
     revision="6b5e83cd29c03729cfdc33d13b1423399b0efb5c",
     release_date="2023-01-11",
     n_parameters=124690944,
+    n_embedding_parameters=38_649_600,
     memory_usage_mb=476,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=384,
     reference="https://huggingface.co/KBLab/sentence-bert-swedish-cased",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=[
+    framework=[
+        "Sentence Transformers",
+        "PyTorch",
+        "safetensors",
+        "GGUF",
+        "Transformers",
+    ],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
     training_datasets=None,
     adapted_from="sentence-transformers/all-mpnet-base-v2",
-    citation="""@misc{rekathati2021introducing,
-    author = {Rekathati, Faton},
-    title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
-    url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
-    year = {2021}
+    citation="""@misc{rekathati2021introducing,
+    author = {Rekathati, Faton},
+    title = {The KBLab Blog: Introducing a Swedish Sentence Transformer},
+    url = {https://kb-labb.github.io/posts/2021-08-23-a-swedish-sentence-transformer/},
+    year = {2021}
 }""",
 )
mteb/models/model_implementations/kennethenevoldsen_models.py

@@ -4,7 +4,7 @@ from mteb.models.sentence_transformer_wrapper import (
 )
 
 dfm_enc_large = ModelMeta(
-    loader=sentence_transformers_loader,
+    loader=sentence_transformers_loader,
     name="KennethEnevoldsen/dfm-sentence-encoder-large",
     model_type=["dense"],
     languages=["dan-Latn"],
@@ -12,13 +12,14 @@ dfm_enc_large = ModelMeta(
     revision="132c53391e7a780dc6a2f9a03724d0158fe7122c",
     release_date="2023-07-12",
     n_parameters=355087360,
+    n_embedding_parameters=51_200_000,
     memory_usage_mb=1554,
     embed_dim=1024,
     license="mit",
     max_tokens=512,
     reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-large",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
     use_instructions=False,
     superseded_by=None,
     adapted_from="chcaa/dfm-encoder-large-v1",
@@ -39,7 +40,7 @@
 )
 
 dfm_enc_med = ModelMeta(
-    loader=sentence_transformers_loader,
+    loader=sentence_transformers_loader,
     name="KennethEnevoldsen/dfm-sentence-encoder-medium",
     model_type=["dense"],
     languages=["dan-Latn"],
@@ -47,13 +48,14 @@ dfm_enc_med = ModelMeta(
     revision="701bce95d499fa97610d57e8823c54fd1fb79930",
     release_date="2023-07-12",
     n_parameters=124445952,
+    n_embedding_parameters=38_403_840,
     memory_usage_mb=475,
     embed_dim=768,
     license="mit",
     max_tokens=512,
     reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-medium",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "Transformers"],
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
mteb/models/model_implementations/kfst.py

@@ -10,13 +10,14 @@ xlmr_scandi = ModelMeta(
     revision="d40c10ca7b1e68b5a8372f2d112dac9eb3279df1",
     release_date="2022-02-22",
     n_parameters=278043648,
+    n_embedding_parameters=192_001_536,
     memory_usage_mb=1061,
     embed_dim=768,
     license="not specified",
     max_tokens=512,
     reference="https://huggingface.co/KFST/XLMRoberta-en-da-sv-nb",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/kowshik24_models.py

@@ -9,13 +9,14 @@ kowshik24_bangla_embedding_model = ModelMeta(
     revision="6689c21e69be5950596bad084457cbaa138728d8",
     release_date="2025-11-10",
     n_parameters=278_000_000,
+    n_embedding_parameters=192_001_536,
     memory_usage_mb=1061,
     embed_dim=768,
     license="apache-2.0",
     max_tokens=128,
     reference="https://huggingface.co/Kowshik24/bangla-sentence-transformer-ft-matryoshka-paraphrase-multilingual-mpnet-base-v2",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
     use_instructions=False,
     public_training_code="https://github.com/kowshik24/Bangla-Embedding",
     public_training_data="https://huggingface.co/datasets/sartajekram/BanglaRQA",
mteb/models/model_implementations/lens_models.py

@@ -18,6 +18,7 @@ lens_d4000 = ModelMeta(
     revision="e473b33364e6c48a324796fd1411d3b93670c6fe",
     release_date="2025-01-17",
     n_parameters=int(7.11 * 1e9),
+    n_embedding_parameters=131_084_288,
     memory_usage_mb=27125,
     embed_dim=4000,
     license="apache-2.0",
@@ -41,6 +42,7 @@ lens_d8000 = ModelMeta(
     revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef",
     release_date="2025-01-17",
     n_parameters=int(7.11 * 1e9),
+    n_embedding_parameters=131_084_288,
     memory_usage_mb=27125,
     embed_dim=8000,
     license="apache-2.0",
mteb/models/model_implementations/lgai_embedding_models.py

@@ -52,13 +52,14 @@ lgai_embedding_en = ModelMeta(
     revision="5e0b2316acc8c2e2941ded6b9cb200b1cb313e65",
     release_date="2025-06-11",
     n_parameters=7_110_000_000,
+    n_embedding_parameters=131_084_288,
     memory_usage_mb=27125,
     embed_dim=4096,
     license="apache-2.0",
     max_tokens=32768,
     reference="https://huggingface.co/annamodels/LGAI-Embedding-Preview",
     similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/linq_models.py

@@ -1,11 +1,16 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import torch
 
 from mteb.models.instruct_wrapper import instruct_wrapper
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import PromptType
 
 from .e5_instruct import E5_MISTRAL_TRAINING_DATA
 
+if TYPE_CHECKING:
+    from mteb.types import PromptType
 LINQ_EMBED_MISTRAL_CITATION = """@misc{LinqAIResearch2024,
     title={Linq-Embed-Mistral:Elevating Text Retrieval with Improved GPT Data Through Task-Specific Control and Quality Refinement},
     author={Junseong Kim and Seolhwa Lee and Jihoon Kwon and Sangmo Gu and Yejin Kim and Minkyung Cho and Jy-yong Sohn and Chanyeol Choi},
@@ -38,13 +43,14 @@ Linq_Embed_Mistral = ModelMeta(
     revision="0c1a0b0589177079acc552433cad51d7c9132379",
     release_date="2024-05-29",  # initial commit of hf model.
     n_parameters=7_110_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=13563,
     embed_dim=4096,
     license="cc-by-nc-4.0",
     max_tokens=32768,
     reference="https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=True,
     public_training_code=None,
     public_training_data=None,
mteb/models/model_implementations/listconranker.py

@@ -1,14 +1,19 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta
-from mteb.types import BatchedInput, PromptType
 
 from .rerankers_custom import RerankerWrapper
 
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import BatchedInput, PromptType
+
 LISTCONRANKER_CITATION = """@article{liu2025listconranker,
     title={ListConRanker: A Contrastive Text Reranker with Listwise Encoding},
     author={Liu, Junlong and Ma, Yue and Zhao, Ruihui and Zheng, Junhao and Ma, Qianli and Kang, Yangyang},
@@ -118,6 +123,7 @@ listconranker = ModelMeta(
     revision="95ae6a5f422a916bc36520f0f3e198e7d91520a0",
     release_date="2024-12-11",
     n_parameters=401_000_000,
+    n_embedding_parameters=None,
     memory_usage_mb=1242,
     similarity_fn_name="cosine",
     training_datasets=listconranker_training_datasets,
@@ -125,7 +131,7 @@
     license="mit",
     max_tokens=512,
     reference="https://huggingface.co/ByteDance/ListConRanker",
-    framework=["PyTorch"],
+    framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     public_training_data=None,
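
Many of the hunks above only widen the `framework` tag lists on existing ModelMeta entries (adding "Transformers", "ONNX", "safetensors", "GGUF"). A hedged sketch of consuming those tags client-side, treating entries simply as objects with a `framework` attribute; the dataclass below is an illustrative stand-in, not mteb's ModelMeta:

    from dataclasses import dataclass, field


    @dataclass
    class ModelMetaSketch:  # stand-in for mteb.models.model_meta.ModelMeta
        name: str
        framework: list[str] = field(default_factory=list)


    # Framework lists copied from the hunks above.
    metas = [
        ModelMetaSketch("jinaai/jina-clip-v1", ["PyTorch", "Transformers", "ONNX", "safetensors"]),
        ModelMetaSketch("ByteDance/ListConRanker", ["PyTorch", "Sentence Transformers", "safetensors", "Transformers"]),
    ]

    # Pick out entries that now advertise an ONNX export.
    onnx_ready = [m.name for m in metas if "ONNX" in m.framework]
    print(onnx_ready)  # ['jinaai/jina-clip-v1']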
|