mteb 2.7.2__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +63 -14
- mteb/_evaluators/any_sts_evaluator.py +12 -5
- mteb/_evaluators/clustering_evaluator.py +12 -4
- mteb/_evaluators/evaluator.py +11 -5
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +14 -5
- mteb/_evaluators/pair_classification_evaluator.py +13 -5
- mteb/_evaluators/retrieval_evaluator.py +22 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +20 -11
- mteb/_evaluators/text/bitext_mining_evaluator.py +10 -3
- mteb/_evaluators/text/summarization_evaluator.py +10 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +12 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +48 -21
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +25 -9
- mteb/abstasks/clustering.py +23 -10
- mteb/abstasks/clustering_legacy.py +22 -8
- mteb/abstasks/image/image_text_pair_classification.py +23 -9
- mteb/abstasks/multilabel_classification.py +13 -5
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +56 -30
- mteb/abstasks/retrieval_dataset_loaders.py +48 -37
- mteb/abstasks/sts.py +29 -13
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +23 -12
- mteb/abstasks/text/reranking.py +2 -2
- mteb/abstasks/text/summarization.py +19 -8
- mteb/abstasks/zeroshot_classification.py +23 -9
- mteb/benchmarks/_create_table.py +13 -7
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +33 -20
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +11 -4
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +32 -6
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +24 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +3 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +17 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +32 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +52 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +23 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +31 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +32 -16
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
- mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
- mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
- mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
- mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
- mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
- mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
- mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
- mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
- mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
- mteb/tasks/classification/fra/french_book_reviews.py +2 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
- mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
- mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
- mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
- mteb/tasks/classification/jpn/wrime_classification.py +1 -1
- mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
- mteb/tasks/classification/kor/klue_tc.py +2 -2
- mteb/tasks/classification/kor/kor_fin.py +1 -1
- mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
- mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
- mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
- mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
- mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
- mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
- mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
- mteb/tasks/classification/ory/odia_news_classification.py +2 -2
- mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
- mteb/tasks/classification/ron/moroco.py +1 -1
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
- mteb/tasks/classification/rus/georeview_classification.py +1 -1
- mteb/tasks/classification/rus/headline_classification.py +2 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
- mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
- mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
- mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
- mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
- mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
- mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
- mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
- mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
- mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
- mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
- mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
- mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
- mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
- mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
- mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
- mteb/tasks/clustering/nob/snl_clustering.py +8 -3
- mteb/tasks/clustering/nob/vg_clustering.py +8 -3
- mteb/tasks/clustering/pol/polish_clustering.py +3 -3
- mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
- mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +4 -4
- mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
- mteb/tasks/multichoice/eng/cv_bench.py +4 -4
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
- mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
- mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
- mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
- mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
- mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
- mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
- mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
- mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
- mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
- mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
- mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
- mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
- mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
- mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
- mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
- mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
- mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
- mteb/tasks/pair_classification/rus/terra.py +2 -2
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
- mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
- mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +4 -4
- mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
- mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
- mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
- mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
- mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
- mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +1 -1
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
- mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/ml_questions.py +1 -1
- mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
- mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
- mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
- mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
- mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
- mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
- mteb/tasks/retrieval/nob/norquad.py +1 -1
- mteb/tasks/retrieval/nob/snl_retrieval.py +1 -1
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
- mteb/tasks/sts/fao/faroese_sts.py +1 -1
- mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
- mteb/tasks/sts/kor/klue_sts.py +1 -1
- mteb/tasks/sts/por/sick_br_sts.py +1 -1
- mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
- mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/RECORD +486 -465
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
2
4
|
|
|
3
5
|
from mteb.models.abs_encoder import AbsEncoder
|
|
4
6
|
from mteb.models.model_meta import ModelMeta
|
|
5
7
|
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from mteb.types import Array
|
|
10
|
+
|
|
6
11
|
|
|
7
12
|
class OPSWrapper(AbsEncoder):
|
|
8
13
|
def __init__(self, model_name: str, revision: str):
|
|
@@ -15,7 +20,7 @@ class OPSWrapper(AbsEncoder):
|
|
|
15
20
|
)
|
|
16
21
|
self.output_dim = 1536
|
|
17
22
|
|
|
18
|
-
def encode(self, sentences: list[str], **kwargs) ->
|
|
23
|
+
def encode(self, sentences: list[str], **kwargs) -> Array:
|
|
19
24
|
embeddings = self.model.encode(sentences, **kwargs)
|
|
20
25
|
return embeddings[:, : self.output_dim]
|
|
21
26
|
|
|
@@ -28,6 +33,7 @@ ops_moa_conan_embedding = ModelMeta(
|
|
|
28
33
|
languages=["zho-Hans"],
|
|
29
34
|
loader=OPSWrapper,
|
|
30
35
|
n_parameters=int(343 * 1e6),
|
|
36
|
+
n_embedding_parameters=21_635_072,
|
|
31
37
|
memory_usage_mb=1308,
|
|
32
38
|
max_tokens=512,
|
|
33
39
|
embed_dim=1536,
|
|
@@ -60,6 +66,7 @@ ops_moa_yuan_embedding = ModelMeta(
|
|
|
60
66
|
languages=["zho-Hans"],
|
|
61
67
|
loader=OPSWrapper,
|
|
62
68
|
n_parameters=int(343 * 1e6),
|
|
69
|
+
n_embedding_parameters=21_635_072,
|
|
63
70
|
memory_usage_mb=1242,
|
|
64
71
|
max_tokens=512,
|
|
65
72
|
embed_dim=1536,
|
|
@@ -12,6 +12,7 @@ piccolo_base_zh = ModelMeta(
|
|
|
12
12
|
revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
|
|
13
13
|
release_date="2023-09-04", # first commit
|
|
14
14
|
n_parameters=None,
|
|
15
|
+
n_embedding_parameters=16_226_304,
|
|
15
16
|
memory_usage_mb=None, # can't see on model card
|
|
16
17
|
embed_dim=768,
|
|
17
18
|
license="mit",
|
|
@@ -37,6 +38,7 @@ piccolo_large_zh_v2 = ModelMeta(
|
|
|
37
38
|
revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
|
|
38
39
|
release_date="2024-04-22", # first commit
|
|
39
40
|
n_parameters=None,
|
|
41
|
+
n_embedding_parameters=None,
|
|
40
42
|
memory_usage_mb=None, # we don't know because they removed the model
|
|
41
43
|
embed_dim=1024,
|
|
42
44
|
license="not specified",
|
|
@@ -1,15 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import torch
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
7
|
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
18
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
13
19
|
|
|
14
20
|
from .repllama_models import RepLLaMAModel, model_prompts
|
|
15
21
|
|
|
@@ -81,6 +87,7 @@ promptriever_llama2 = ModelMeta(
|
|
|
81
87
|
revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision
|
|
82
88
|
release_date="2024-09-15",
|
|
83
89
|
n_parameters=7_000_000_000,
|
|
90
|
+
n_embedding_parameters=None,
|
|
84
91
|
memory_usage_mb=26703,
|
|
85
92
|
max_tokens=4096,
|
|
86
93
|
embed_dim=4096,
|
|
@@ -117,6 +124,7 @@ promptriever_llama3 = ModelMeta(
|
|
|
117
124
|
},
|
|
118
125
|
release_date="2024-09-15",
|
|
119
126
|
n_parameters=8_000_000_000,
|
|
127
|
+
n_embedding_parameters=None,
|
|
120
128
|
memory_usage_mb=30518,
|
|
121
129
|
max_tokens=8192,
|
|
122
130
|
embed_dim=4096,
|
|
@@ -146,6 +154,7 @@ promptriever_llama3_instruct = ModelMeta(
|
|
|
146
154
|
revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision
|
|
147
155
|
release_date="2024-09-15",
|
|
148
156
|
n_parameters=8_000_000_000,
|
|
157
|
+
n_embedding_parameters=None,
|
|
149
158
|
memory_usage_mb=30518,
|
|
150
159
|
max_tokens=8192,
|
|
151
160
|
embed_dim=4096,
|
|
@@ -179,6 +188,7 @@ promptriever_mistral_v1 = ModelMeta(
|
|
|
179
188
|
revision="7231864981174d9bee8c7687c24c8344414eae6b-876d63e49b6115ecb6839893a56298fadee7e8f5", # base-peft revision
|
|
180
189
|
release_date="2024-09-15",
|
|
181
190
|
n_parameters=7_000_000_000,
|
|
191
|
+
n_embedding_parameters=131_072_000,
|
|
182
192
|
memory_usage_mb=26703,
|
|
183
193
|
training_datasets={
|
|
184
194
|
# "samaya-ai/msmarco-w-instructions",
|
|
@@ -1,30 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import heapq
|
|
2
4
|
import logging
|
|
3
5
|
import shutil
|
|
4
6
|
import tempfile
|
|
5
7
|
from pathlib import Path
|
|
6
|
-
from typing import Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
7
9
|
|
|
8
10
|
import torch
|
|
9
|
-
from torch.utils.data import DataLoader
|
|
10
11
|
|
|
11
12
|
from mteb._create_dataloaders import (
|
|
12
13
|
create_dataloader,
|
|
13
14
|
)
|
|
14
15
|
from mteb._requires_package import requires_package
|
|
15
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
16
|
from mteb.models.abs_encoder import AbsEncoder
|
|
17
17
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
18
|
-
from mteb.types import
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
18
|
+
from mteb.types import PromptType
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from torch.utils.data import DataLoader
|
|
22
|
+
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
24
|
+
from mteb.types import (
|
|
25
|
+
Array,
|
|
26
|
+
BatchedInput,
|
|
27
|
+
CorpusDatasetType,
|
|
28
|
+
EncodeKwargs,
|
|
29
|
+
QueryDatasetType,
|
|
30
|
+
RetrievalOutputType,
|
|
31
|
+
TopRankedDocumentsType,
|
|
32
|
+
)
|
|
33
|
+
|
|
28
34
|
|
|
29
35
|
logger = logging.getLogger(__name__)
|
|
30
36
|
|
|
@@ -47,6 +53,7 @@ class PylateSearchEncoder:
|
|
|
47
53
|
hf_split: str,
|
|
48
54
|
hf_subset: str,
|
|
49
55
|
encode_kwargs: EncodeKwargs,
|
|
56
|
+
num_proc: int,
|
|
50
57
|
) -> None:
|
|
51
58
|
"""Index the corpus for retrieval.
|
|
52
59
|
|
|
@@ -56,6 +63,7 @@ class PylateSearchEncoder:
|
|
|
56
63
|
hf_split: Split of current task, allows to know some additional information about current split.
|
|
57
64
|
hf_subset: Subset of current task. Similar to `hf_split` to get more information
|
|
58
65
|
encode_kwargs: Additional arguments to pass to the encoder during indexing.
|
|
66
|
+
num_proc: Number of processes to use for indexing.
|
|
59
67
|
"""
|
|
60
68
|
self.task_corpus = corpus
|
|
61
69
|
|
|
@@ -81,12 +89,14 @@ class PylateSearchEncoder:
|
|
|
81
89
|
top_k: int,
|
|
82
90
|
encode_kwargs: EncodeKwargs,
|
|
83
91
|
top_ranked: TopRankedDocumentsType | None = None,
|
|
92
|
+
num_proc: int,
|
|
84
93
|
) -> RetrievalOutputType:
|
|
85
94
|
queries_dataloader = create_dataloader(
|
|
86
95
|
queries,
|
|
87
96
|
task_metadata,
|
|
88
97
|
prompt_type=PromptType.query,
|
|
89
98
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
99
|
+
num_proc=num_proc,
|
|
90
100
|
)
|
|
91
101
|
|
|
92
102
|
query_embeddings = self.encode(
|
|
@@ -110,6 +120,7 @@ class PylateSearchEncoder:
|
|
|
110
120
|
hf_subset=hf_subset,
|
|
111
121
|
hf_split=hf_split,
|
|
112
122
|
encode_kwargs=encode_kwargs,
|
|
123
|
+
num_proc=num_proc,
|
|
113
124
|
)
|
|
114
125
|
else:
|
|
115
126
|
result_heaps = self._pylate_full_corpus_search(
|
|
@@ -120,6 +131,7 @@ class PylateSearchEncoder:
|
|
|
120
131
|
hf_subset=hf_subset,
|
|
121
132
|
hf_split=hf_split,
|
|
122
133
|
encode_kwargs=encode_kwargs,
|
|
134
|
+
num_proc=num_proc,
|
|
123
135
|
)
|
|
124
136
|
|
|
125
137
|
results = {qid: {} for qid in query_idx_to_id.values()}
|
|
@@ -138,6 +150,7 @@ class PylateSearchEncoder:
|
|
|
138
150
|
hf_split: str,
|
|
139
151
|
top_k: int,
|
|
140
152
|
encode_kwargs: EncodeKwargs,
|
|
153
|
+
num_proc: int,
|
|
141
154
|
) -> dict[str, list[tuple[float, str]]]:
|
|
142
155
|
from pylate import indexes, retrieve
|
|
143
156
|
|
|
@@ -164,6 +177,7 @@ class PylateSearchEncoder:
|
|
|
164
177
|
task_metadata,
|
|
165
178
|
prompt_type=PromptType.document,
|
|
166
179
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
180
|
+
num_proc=num_proc,
|
|
167
181
|
)
|
|
168
182
|
documents_embeddings = self.encode(
|
|
169
183
|
documents_loader,
|
|
@@ -202,6 +216,7 @@ class PylateSearchEncoder:
|
|
|
202
216
|
hf_subset: str,
|
|
203
217
|
hf_split: str,
|
|
204
218
|
encode_kwargs: EncodeKwargs,
|
|
219
|
+
num_proc: int = 1,
|
|
205
220
|
) -> dict[str, list[tuple[float, str]]]:
|
|
206
221
|
"""Rerank with PyLate's rank.rerank using per-query candidates.
|
|
207
222
|
|
|
@@ -224,6 +239,7 @@ class PylateSearchEncoder:
|
|
|
224
239
|
task_metadata,
|
|
225
240
|
prompt_type=PromptType.document,
|
|
226
241
|
batch_size=encode_kwargs.get("batch_size", 32),
|
|
242
|
+
num_proc=num_proc,
|
|
227
243
|
),
|
|
228
244
|
task_metadata=task_metadata,
|
|
229
245
|
hf_split=hf_split,
|
|
@@ -346,6 +362,7 @@ colbert_v2 = ModelMeta(
|
|
|
346
362
|
public_training_data=None,
|
|
347
363
|
release_date="2024-09-21",
|
|
348
364
|
n_parameters=int(110 * 1e6),
|
|
365
|
+
n_embedding_parameters=23_440_896,
|
|
349
366
|
memory_usage_mb=418,
|
|
350
367
|
max_tokens=180,
|
|
351
368
|
embed_dim=None,
|
|
@@ -402,6 +419,7 @@ jina_colbert_v2 = ModelMeta(
|
|
|
402
419
|
public_training_data=None,
|
|
403
420
|
release_date="2024-08-16",
|
|
404
421
|
n_parameters=int(559 * 1e6),
|
|
422
|
+
n_embedding_parameters=None,
|
|
405
423
|
memory_usage_mb=1067,
|
|
406
424
|
max_tokens=8192,
|
|
407
425
|
embed_dim=None,
|
|
@@ -458,6 +476,7 @@ lightonai__gte_moderncolbert_v1 = ModelMeta(
|
|
|
458
476
|
public_training_data="https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma",
|
|
459
477
|
release_date="2025-04-30",
|
|
460
478
|
n_parameters=int(149 * 1e6),
|
|
479
|
+
n_embedding_parameters=None,
|
|
461
480
|
memory_usage_mb=None,
|
|
462
481
|
max_tokens=8192,
|
|
463
482
|
embed_dim=None,
|
|
@@ -36,6 +36,7 @@ Qodo_Embed_1_1_5B = ModelMeta(
|
|
|
36
36
|
revision="84bbef079b32e8823ec226d4e9e92902706b0eb6",
|
|
37
37
|
release_date="2025-02-19",
|
|
38
38
|
n_parameters=1_780_000_000,
|
|
39
|
+
n_embedding_parameters=232_928_256,
|
|
39
40
|
memory_usage_mb=6776,
|
|
40
41
|
embed_dim=1536,
|
|
41
42
|
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
|
|
@@ -59,6 +60,7 @@ Qodo_Embed_1_7B = ModelMeta(
|
|
|
59
60
|
revision="f9edd9bf7f687c0e832424058e265120f603cd81",
|
|
60
61
|
release_date="2025-02-24",
|
|
61
62
|
n_parameters=7_613_000_000,
|
|
63
|
+
n_embedding_parameters=None,
|
|
62
64
|
memory_usage_mb=29040,
|
|
63
65
|
embed_dim=3584,
|
|
64
66
|
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
|
|
@@ -1,6 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
6
|
from mteb.models.model_meta import ModelMeta
|
|
3
|
-
from mteb.
|
|
7
|
+
from mteb.types import PromptType
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
4
11
|
|
|
5
12
|
|
|
6
13
|
def instruction_template(
|
|
@@ -140,6 +147,7 @@ Qwen3_Embedding_0B6 = ModelMeta(
|
|
|
140
147
|
revision="b22da495047858cce924d27d76261e96be6febc0", # Commit of @tomaarsen
|
|
141
148
|
release_date="2025-06-05",
|
|
142
149
|
n_parameters=595776512,
|
|
150
|
+
n_embedding_parameters=None,
|
|
143
151
|
memory_usage_mb=1136,
|
|
144
152
|
embed_dim=1024,
|
|
145
153
|
max_tokens=32768,
|
|
@@ -163,6 +171,7 @@ Qwen3_Embedding_4B = ModelMeta(
|
|
|
163
171
|
revision="636cd9bf47d976946cdbb2b0c3ca0cb2f8eea5ff", # Commit of @tomaarsen
|
|
164
172
|
release_date="2025-06-05",
|
|
165
173
|
n_parameters=4021774336,
|
|
174
|
+
n_embedding_parameters=None,
|
|
166
175
|
memory_usage_mb=7671,
|
|
167
176
|
embed_dim=2560,
|
|
168
177
|
max_tokens=32768,
|
|
@@ -186,6 +195,7 @@ Qwen3_Embedding_8B = ModelMeta(
|
|
|
186
195
|
revision="4e423935c619ae4df87b646a3ce949610c66241c", # Commit of @tomaarsen
|
|
187
196
|
release_date="2025-06-05",
|
|
188
197
|
n_parameters=7567295488,
|
|
198
|
+
n_embedding_parameters=None,
|
|
189
199
|
memory_usage_mb=14433,
|
|
190
200
|
embed_dim=4096,
|
|
191
201
|
max_tokens=32768,
|
|
@@ -64,6 +64,7 @@ QZhou_Embedding = ModelMeta(
|
|
|
64
64
|
revision="f1e6c03ee3882e7b9fa5cec91217715272e433b8",
|
|
65
65
|
release_date="2025-08-24",
|
|
66
66
|
n_parameters=7_070_619_136,
|
|
67
|
+
n_embedding_parameters=None,
|
|
67
68
|
memory_usage_mb=14436,
|
|
68
69
|
embed_dim=3584,
|
|
69
70
|
license="apache-2.0",
|
|
@@ -98,6 +99,7 @@ QZhou_Embedding_Zh = ModelMeta(
|
|
|
98
99
|
revision="0321ccb126413d1e49c5ce908e802b63d35f18e2",
|
|
99
100
|
release_date="2025-09-28",
|
|
100
101
|
n_parameters=7_575_747_328,
|
|
102
|
+
n_embedding_parameters=None,
|
|
101
103
|
memory_usage_mb=29431,
|
|
102
104
|
embed_dim=1792,
|
|
103
105
|
license="apache-2.0",
|
|
@@ -5,18 +5,19 @@ from typing import TYPE_CHECKING, Any, Literal
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import torch
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
8
|
|
|
10
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
9
|
from mteb.models.model_meta import ModelMeta
|
|
12
10
|
from mteb.similarity_functions import (
|
|
13
11
|
select_pairwise_similarity,
|
|
14
12
|
select_similarity,
|
|
15
13
|
)
|
|
16
|
-
from mteb.types._encoder_io import Array, BatchedInput, PromptType
|
|
17
14
|
|
|
18
15
|
if TYPE_CHECKING:
|
|
19
16
|
from PIL import Image
|
|
17
|
+
from torch.utils.data import DataLoader
|
|
18
|
+
|
|
19
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
20
|
+
from mteb.types._encoder_io import Array, BatchedInput, PromptType
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
def _string_to_vector(text: str | None, size: int) -> np.ndarray:
|
|
@@ -36,12 +36,76 @@ REASONIR_TRAINING_DATA = {
|
|
|
36
36
|
"DuRetrieval",
|
|
37
37
|
"QuoraRetrieval",
|
|
38
38
|
}
|
|
39
|
+
_prompts_dict = {
|
|
40
|
+
"BrightBiologyRetrieval": {
|
|
41
|
+
"query": "Given a Biology post, retrieve relevant passages that help answer the post"
|
|
42
|
+
},
|
|
43
|
+
"BrightEarthScienceRetrieval": {
|
|
44
|
+
"query": "Given a Earth Science post, retrieve relevant passages that help answer the post"
|
|
45
|
+
},
|
|
46
|
+
"BrightEconomicsRetrieval": {
|
|
47
|
+
"query": "Given a Economics post, retrieve relevant passages that help answer the post"
|
|
48
|
+
},
|
|
49
|
+
"BrightPsychologyRetrieval": {
|
|
50
|
+
"query": "Given a Psychology post, retrieve relevant passages that help answer the post"
|
|
51
|
+
},
|
|
52
|
+
"BrightRoboticsRetrieval": {
|
|
53
|
+
"query": "Given a Robotics post, retrieve relevant passages that help answer the post"
|
|
54
|
+
},
|
|
55
|
+
"BrightStackoverflowRetrieval": {
|
|
56
|
+
"query": "Given a Stackoverflow post, retrieve relevant passages that help answer the post"
|
|
57
|
+
},
|
|
58
|
+
"BrightSustainableLivingRetrieval": {
|
|
59
|
+
"query": "Given a Sustainable Living post, retrieve relevant passages that help answer the post"
|
|
60
|
+
},
|
|
61
|
+
"BrightPonyRetrieval": {
|
|
62
|
+
"query": "Given a Pony question, retrieve relevant passages that help answer the question"
|
|
63
|
+
},
|
|
64
|
+
"BrightLeetcodeRetrieval": {
|
|
65
|
+
"query": "Given a coding problem, retrieve relevant examples that help answer the problem",
|
|
66
|
+
},
|
|
67
|
+
"BrightAopsRetrieval": {
|
|
68
|
+
"query": "Given a Math problem, retrieve relevant examples that help answer the problem"
|
|
69
|
+
},
|
|
70
|
+
"BrightTheoremQATheoremsRetrieval": {
|
|
71
|
+
"query": "Given a Math problem, retrieve relevant theorems that help answer the problem",
|
|
72
|
+
},
|
|
73
|
+
"BrightTheoremQAQuestionsRetrieval": {
|
|
74
|
+
"query": "Given a Math problem, retrieve relevant examples that help answer the problem",
|
|
75
|
+
},
|
|
76
|
+
"BrightBiologyLongRetrieval": {
|
|
77
|
+
"query": "Given a Biology post, retrieve relevant documents that help answer the post"
|
|
78
|
+
},
|
|
79
|
+
"BrightEarthScienceLongRetrieval": {
|
|
80
|
+
"query": "Given a Earth Science post, retrieve relevant documents that help answer the post"
|
|
81
|
+
},
|
|
82
|
+
"BrightEconomicsLongRetrieval": {
|
|
83
|
+
"query": "Given a Economics post, retrieve relevant documents that help answer the post"
|
|
84
|
+
},
|
|
85
|
+
"BrightPsychologyLongRetrieval": {
|
|
86
|
+
"query": "Given a Psychology post, retrieve relevant documents that help answer the post"
|
|
87
|
+
},
|
|
88
|
+
"BrightRoboticsLongRetrieval": {
|
|
89
|
+
"query": "Given a Robotics post, retrieve relevant documents that help answer the post"
|
|
90
|
+
},
|
|
91
|
+
"BrightStackoverflowLongRetrieval": {
|
|
92
|
+
"query": "Given a Stackoverflow post, retrieve relevant documents that help answer the post"
|
|
93
|
+
},
|
|
94
|
+
"BrightSustainableLivingLongRetrieval": {
|
|
95
|
+
"query": "Given a Sustainable Living post, retrieve relevant documents that help answer the post"
|
|
96
|
+
},
|
|
97
|
+
"BrightPonyLongRetrieval": {
|
|
98
|
+
"query": "Given a Pony question, retrieve relevant documents that help answer the question"
|
|
99
|
+
},
|
|
100
|
+
}
|
|
101
|
+
|
|
39
102
|
|
|
40
103
|
ReasonIR_8B = ModelMeta(
|
|
41
104
|
loader=InstructSentenceTransformerModel,
|
|
42
105
|
loader_kwargs=dict(
|
|
43
106
|
instruction_template=instruction_template,
|
|
44
107
|
trust_remote_code=True,
|
|
108
|
+
prompts_dict=_prompts_dict,
|
|
45
109
|
),
|
|
46
110
|
name="ReasonIR/ReasonIR-8B",
|
|
47
111
|
model_type=["dense"],
|
|
@@ -50,6 +114,7 @@ ReasonIR_8B = ModelMeta(
|
|
|
50
114
|
revision="c3d0690370ff4a8c3d3882d8dfa85c43650034fa",
|
|
51
115
|
release_date="2025-04-29",
|
|
52
116
|
n_parameters=7_500_000_000,
|
|
117
|
+
n_embedding_parameters=None,
|
|
53
118
|
memory_usage_mb=None,
|
|
54
119
|
embed_dim=4096,
|
|
55
120
|
license="cc-by-nc-4.0",
|
|
@@ -1,22 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import torch
|
|
7
8
|
import torch.nn.functional as F
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
9
|
from tqdm.auto import tqdm
|
|
10
10
|
|
|
11
11
|
from mteb._requires_package import requires_package
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
12
|
from mteb.models.abs_encoder import AbsEncoder
|
|
14
13
|
from mteb.models.model_meta import (
|
|
15
14
|
ModelMeta,
|
|
16
15
|
ScoringFunction,
|
|
17
16
|
)
|
|
18
|
-
from mteb.
|
|
19
|
-
|
|
17
|
+
from mteb.types import PromptType
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from collections.abc import Callable
|
|
21
|
+
|
|
22
|
+
from torch.utils.data import DataLoader
|
|
23
|
+
|
|
24
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
25
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
26
|
+
from mteb.types import Array, BatchedInput
|
|
20
27
|
|
|
21
28
|
logger = logging.getLogger(__name__)
|
|
22
29
|
|
|
@@ -172,6 +179,7 @@ repllama_llama2_original = ModelMeta(
|
|
|
172
179
|
"mMARCO-NL", # translation not trained on
|
|
173
180
|
},
|
|
174
181
|
n_parameters=7_000_000,
|
|
182
|
+
n_embedding_parameters=131_072_000,
|
|
175
183
|
memory_usage_mb=27,
|
|
176
184
|
max_tokens=4096,
|
|
177
185
|
embed_dim=4096,
|
|
@@ -201,6 +209,7 @@ repllama_llama2_reproduced = ModelMeta(
|
|
|
201
209
|
revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision
|
|
202
210
|
release_date="2024-09-15",
|
|
203
211
|
n_parameters=7_000_000,
|
|
212
|
+
n_embedding_parameters=None,
|
|
204
213
|
memory_usage_mb=27,
|
|
205
214
|
max_tokens=4096,
|
|
206
215
|
embed_dim=4096,
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
|
|
7
8
|
from mteb._requires_package import requires_package
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.model_meta import ModelMeta
|
|
10
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
11
10
|
|
|
12
11
|
from .bge_models import bge_m3_training_data
|
|
13
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from torch.utils.data import DataLoader
|
|
15
|
+
|
|
16
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
17
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
18
|
+
|
|
19
|
+
|
|
14
20
|
logger = logging.getLogger(__name__)
|
|
15
21
|
|
|
16
22
|
|
|
@@ -225,6 +231,7 @@ monobert_large = ModelMeta(
|
|
|
225
231
|
revision="0a97706f3827389da43b83348d5d18c9d53876fa",
|
|
226
232
|
release_date="2020-05-28",
|
|
227
233
|
n_parameters=None,
|
|
234
|
+
n_embedding_parameters=31_254_528,
|
|
228
235
|
memory_usage_mb=None,
|
|
229
236
|
max_tokens=None,
|
|
230
237
|
embed_dim=None,
|
|
@@ -250,6 +257,7 @@ jina_reranker_multilingual = ModelMeta(
|
|
|
250
257
|
revision="126747772a932960028d9f4dc93bd5d9c4869be4",
|
|
251
258
|
release_date="2024-09-26",
|
|
252
259
|
n_parameters=None,
|
|
260
|
+
n_embedding_parameters=None,
|
|
253
261
|
memory_usage_mb=531,
|
|
254
262
|
max_tokens=None,
|
|
255
263
|
embed_dim=None,
|
|
@@ -313,6 +321,7 @@ bge_reranker_v2_m3 = ModelMeta(
|
|
|
313
321
|
revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e",
|
|
314
322
|
release_date="2024-06-24",
|
|
315
323
|
n_parameters=None,
|
|
324
|
+
n_embedding_parameters=256_002_048,
|
|
316
325
|
memory_usage_mb=2166,
|
|
317
326
|
max_tokens=None,
|
|
318
327
|
embed_dim=None,
|