mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0

mteb/models/cache_wrappers/cache_backends/numpy_cache.py
CHANGED

@@ -1,11 +1,11 @@
 import json
 import logging
+import warnings
 from pathlib import Path
+from typing import Any

 import numpy as np

-from mteb.types import BatchedInput
-
 from ._hash_utils import _hash_item

 logger = logging.getLogger(__name__)
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 class NumpyCache:
     """Generic vector cache for both text and images."""

-    def __init__(self, directory: str | Path, initial_vectors: int =
+    def __init__(self, directory: str | Path, initial_vectors: int = 100_000):
         self.directory = Path(directory)
         self.directory.mkdir(parents=True, exist_ok=True)
         self.vectors_file = self.directory / "vectors.npy"
@@ -27,7 +27,7 @@ class NumpyCache:
         logger.info(f"Initialized VectorCacheMap in directory: {self.directory}")
         self._initialize_vectors_file()

-    def add(self,
+    def add(self, items: list[dict[str, Any]], vectors: np.ndarray) -> None:
         """Add a vector to the cache."""
         try:
             if self.vector_dim is None:
@@ -38,12 +38,17 @@
                 self._save_dimension()
                 logger.info(f"Initialized vector dimension to {self.vector_dim}")

-
+            if self.vectors is None:
+                raise RuntimeError(
+                    "Vectors file not initialized. Call _initialize_vectors_file() first."
+                )
+
+            for item, vec in zip(items, vectors):
                 item_hash = _hash_item(item)
                 if item_hash in self.hash_to_index:
-
-
-                    )
+                    msg = f"Hash collision or duplicate item for hash {item_hash}. Overwriting existing vector."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                     index = self.hash_to_index[item_hash]
                 else:
                     index = len(self.hash_to_index)
@@ -74,18 +79,26 @@
                 shape=(self.initial_vectors, self.vector_dim),
             )
         else:
-            self.vectors = np.memmap(
-
+            self.vectors = np.memmap(
+                self.vectors_file,
+                dtype="float32",
+                mode="r+",
+                shape=(-1, self.vector_dim),
+            )
         logger.info(f"Vectors file initialized with shape: {self.vectors.shape}")

     def _double_vectors_file(self) -> None:
+        if self.vectors is None or self.vector_dim is None:
+            raise RuntimeError(
+                "Vectors file not initialized. Call _initialize_vectors_file() first."
+            )
         current_size = len(self.vectors)
         new_size = current_size * 2
         logger.info(f"Doubling vectors file from {current_size} to {new_size} vectors")
         self.vectors.flush()
         new_vectors = np.memmap(
-            self.vectors_file,
-            dtype=
+            str(self.vectors_file),
+            dtype=np.float32,
             mode="r+",
             shape=(new_size, self.vector_dim),
         )
@@ -107,9 +120,9 @@ class NumpyCache:
                 f"Loaded vector dimension {self.vector_dim} from {self.dimension_file}"
             )
         else:
-
-
-            )
+            msg = "Dimension file not found. Vector dimension remains uninitialized."
+            logger.warning(msg)
+            warnings.warn(msg)

     def save(self) -> None:
         """Persist VectorCacheMap to disk."""
@@ -146,25 +159,30 @@ class NumpyCache:

                 if self.vector_dim is not None:
                     self.vectors = np.memmap(
-                        self.vectors_file,
+                        self.vectors_file,
+                        dtype="float32",
+                        mode="r+",
+                        shape=(-1, self.vector_dim),
                     )
-                    self.vectors = self.vectors.reshape(-1, self.vector_dim)
                     logger.info(f"Loaded vectors file with shape: {self.vectors.shape}")
                 else:
-
-
-                    )
+                    msg = "Vector dimension not set. Unable to load vectors file."
+                    logger.warning(msg)
+                    warnings.warn(msg)
                 logger.info(f"Loaded VectorCacheMap from {self.directory}")
             else:
-
-
-                )
+                msg = "No existing files found. Initialized empty VectorCacheMap."
+                logger.warning(msg)
+                warnings.warn(msg)
         except Exception as e:
             logger.error(f"Error loading VectorCacheMap: {str(e)}")
             raise

-    def get_vector(self, item:
+    def get_vector(self, item: dict[str, Any]) -> np.ndarray | None:
         """Retrieve vector from index by hash."""
+        if self.vectors is None:
+            return None
+
         try:
             item_hash = _hash_item(item)
             if item_hash not in self.hash_to_index:
@@ -176,7 +194,7 @@ class NumpyCache:
             logger.error(f"Error retrieving vector for item: {str(e)}")
             raise

-    def __contains__(self, item:
+    def __contains__(self, item: dict[str, Any]) -> bool:
         return _hash_item(item) in self.hash_to_index

     def __del__(self):
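
The NumpyCache hunks above mostly tighten typing (cached items are now list[dict[str, Any]], get_vector returns np.ndarray | None) and turn previously silent failure modes into explicit warnings.warn calls and RuntimeErrors. Below is a minimal usage sketch written against the signatures shown in this diff; the item dicts and the 384-dimensional random vectors are made-up illustration values, and the exact first-write initialization path is not fully visible in these hunks.

import numpy as np

from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache

# Illustrative items; real entries are dataset rows hashed via _hash_item.
items = [{"text": "hello"}, {"text": "world"}]
vectors = np.random.rand(2, 384).astype(np.float32)

cache = NumpyCache("embedding_cache")  # backing files are created under this directory
cache.add(items, vectors)              # add() now raises RuntimeError if the vectors file is uninitialized
cache.save()                           # persists the cache ("Persist VectorCacheMap to disk.")

assert {"text": "hello"} in cache          # __contains__ checks the hash index
vec = cache.get_vector({"text": "hello"})  # ndarray, or None when no vectors file is loaded
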
mteb/models/cache_wrappers/cache_wrapper.py
CHANGED

@@ -90,9 +90,9 @@ class CachedEmbeddingWrapper:
         try:
             cache = self._get_or_create_cache(task_name)

-            uncached_items: list[
+            uncached_items: list[dict[str, Any]] = []
             uncached_indices: list[int] = []
-            all_items = inputs.dataset
+            all_items: Dataset = inputs.dataset
             cached_vectors: dict[int, np.ndarray] = {}

             for i, item in enumerate(all_items):
@@ -112,7 +112,7 @@ class CachedEmbeddingWrapper:
                 dataset,
                 task_metadata=task_metadata,
                 prompt_type=prompt_type,
-
+                **kwargs,
             )
             new_vectors = self._model.encode(
                 dl,
mteb/models/get_model_meta.py
CHANGED

@@ -1,25 +1,15 @@
-from __future__ import annotations
-
 import difflib
 import logging
 from collections.abc import Iterable
-from typing import
-
-from huggingface_hub import ModelCard
-from huggingface_hub.errors import RepositoryNotFoundError
+from typing import Any

 from mteb.abstasks import AbsTask
 from mteb.models import (
-    CrossEncoderWrapper,
     ModelMeta,
     MTEBModels,
-    sentence_transformers_loader,
 )
 from mteb.models.model_implementations import MODEL_REGISTRY

-if TYPE_CHECKING:
-    from sentence_transformers import CrossEncoder, SentenceTransformer
-
 logger = logging.getLogger(__name__)


@@ -31,6 +21,7 @@ def get_model_metas(
     n_parameters_range: tuple[int | None, int | None] = (None, None),
     use_instructions: bool | None = None,
     zero_shot_on: list[AbsTask] | None = None,
+    model_types: Iterable[str] | None = None,
 ) -> list[ModelMeta]:
     """Load all models' metadata that fit the specified criteria.

@@ -43,6 +34,7 @@ def get_model_metas(
             If (None, None), this filter is ignored.
         use_instructions: Whether to filter by models that use instructions. If None, all models are included.
         zero_shot_on: A list of tasks on which the model is zero-shot. If None this filter is ignored.
+        model_types: A list of model types to filter by. If None, all model types are included.

     Returns:
         A list of model metadata objects that fit the specified criteria.
@@ -51,6 +43,7 @@ def get_model_metas(
     model_names = set(model_names) if model_names is not None else None
     languages = set(languages) if languages is not None else None
     frameworks = set(frameworks) if frameworks is not None else None
+    model_types_set = set(model_types) if model_types is not None else None
     for model_meta in MODEL_REGISTRY.values():
         if (model_names is not None) and (model_meta.name not in model_names):
             continue
@@ -67,6 +60,10 @@ def get_model_metas(
             model_meta.use_instructions != use_instructions
         ):
             continue
+        if model_types_set is not None and not model_types_set.intersection(
+            model_meta.model_type
+        ):
+            continue

         lower, upper = n_parameters_range
         n_parameters = model_meta.n_parameters
@@ -85,7 +82,10 @@


 def get_model(
-    model_name: str,
+    model_name: str,
+    revision: str | None = None,
+    device: str | None = None,
+    **kwargs: Any,
 ) -> MTEBModels:
     """A function to fetch and load model object by name.

@@ -95,30 +95,23 @@
     Args:
         model_name: Name of the model to fetch
         revision: Revision of the model to fetch
+        device: Device used to load the model
         **kwargs: Additional keyword arguments to pass to the model loader

     Returns:
         A model object
     """
-    from sentence_transformers import CrossEncoder, SentenceTransformer
-
     meta = get_model_meta(model_name, revision)
-    model = meta.load_model(**kwargs)
-
-
-
-
-
-
-
-
-
-    elif isinstance(model, CrossEncoder):
-        _meta = _model_meta_from_cross_encoder(model.model)
-        if meta.revision is None:
-            meta.revision = _meta.revision if _meta.revision else meta.revision
-
-    model.mteb_model_meta = meta  # type: ignore
+    model = meta.load_model(device=device, **kwargs)
+
+    if kwargs:
+        logger.info(
+            f"Model '{model_name}' loaded with additional arguments: {list(kwargs.keys())}"
+        )
+        meta = meta.model_copy(deep=True)
+        meta.loader_kwargs |= kwargs
+
+    model.mteb_model_meta = meta  # type: ignore[misc]
     return model


@@ -147,12 +140,8 @@ def get_model_meta(
         logger.info(
             "Model not found in model registry. Attempting to extract metadata by loading the model ({model_name}) using HuggingFace."
         )
-
-
-            meta.revision = revision
-            return meta
-        except RepositoryNotFoundError:
-            pass
+        meta = ModelMeta.from_hub(model_name, revision)
+        return meta

     not_found_msg = f"Model '{model_name}' not found in MTEB registry"
     not_found_msg += " nor on the Huggingface Hub." if fetch_from_hf else "."
@@ -170,85 +159,3 @@
             suggestion = f" Did you mean: '{close_matches[0]}'?"

     raise KeyError(not_found_msg + suggestion)
-
-
-def _model_meta_from_hf_hub(model_name: str) -> ModelMeta:
-    card = ModelCard.load(model_name)
-    card_data = card.data.to_dict()
-    frameworks = ["PyTorch"]
-    loader = None
-    if card_data.get("library_name", None) == "sentence-transformers":
-        frameworks.append("Sentence Transformers")
-        loader = sentence_transformers_loader
-    revision = card_data.get("base_model_revision", None)
-    license = card_data.get("license", None)
-    return ModelMeta(
-        loader=loader,
-        name=model_name,
-        revision=revision,
-        release_date=None,
-        languages=None,
-        license=license,
-        framework=frameworks,  # type: ignore
-        training_datasets=None,
-        similarity_fn_name=None,
-        n_parameters=None,
-        memory_usage_mb=None,
-        max_tokens=None,
-        embed_dim=None,
-        open_weights=True,
-        public_training_code=None,
-        public_training_data=None,
-        use_instructions=None,
-    )
-
-
-def _model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta:
-    return ModelMeta(
-        loader=CrossEncoderWrapper,
-        name=model.model.name_or_path,
-        revision=model.config._commit_hash,
-        release_date=None,
-        languages=None,
-        framework=["Sentence Transformers"],
-        similarity_fn_name=None,
-        n_parameters=None,
-        memory_usage_mb=None,
-        max_tokens=None,
-        embed_dim=None,
-        license=None,
-        open_weights=True,
-        public_training_code=None,
-        public_training_data=None,
-        use_instructions=None,
-        training_datasets=None,
-    )
-
-
-def _model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta:
-    name: str | None = (
-        model.model_card_data.model_name
-        if model.model_card_data.model_name
-        else model.model_card_data.base_model
-    )
-    embeddings_dim = model.get_sentence_embedding_dimension()
-    meta = ModelMeta(
-        loader=sentence_transformers_loader,
-        name=name,
-        revision=model.model_card_data.base_model_revision,
-        release_date=None,
-        languages=None,
-        framework=["Sentence Transformers"],
-        similarity_fn_name=None,
-        n_parameters=None,
-        memory_usage_mb=None,
-        max_tokens=None,
-        embed_dim=embeddings_dim,
-        license=None,
-        open_weights=True,
-        public_training_code=None,
-        public_training_data=None,
-        use_instructions=None,
-        training_datasets=None,
-    )
-    return meta
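
Taken together, the get_model_meta.py hunks drop the ad-hoc Hub metadata helpers in favour of ModelMeta.from_hub, add a device argument to get_model (with any extra kwargs recorded on a deep copy of the metadata), and give get_model_metas a model_types filter. A hedged sketch of the resulting call pattern follows; the model name is only an example, and valid model_types values are whatever ModelMeta.model_type holds ("dense" in the hunks elsewhere in this diff).

import mteb

# Load a registered model on an explicit device; extra kwargs are logged and
# merged into a copy of the ModelMeta's loader_kwargs, per the hunk above.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
print(model.mteb_model_meta.name)

# Filter registry metadata by model type via the new model_types parameter.
dense_metas = mteb.get_model_metas(model_types=["dense"])
print(len(dense_metas))
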
mteb/models/instruct_wrapper.py
CHANGED

@@ -17,7 +17,8 @@ logger = logging.getLogger(__name__)
 def instruct_wrapper(
     model_name_or_path: str,
     mode: str,
-    instruction_template: str | Callable[[str], str] | None = None,
+    instruction_template: str | Callable[[str, PromptType | None], str] | None = None,
+    device: str | None = None,
     **kwargs,
 ):
     """Instruct wrapper for models. Uses GritLM to pass instructions to the model.
@@ -28,6 +29,7 @@ def instruct_wrapper(
         model_name_or_path: Model name or path.
         mode: Mode of the model. Either 'query' or 'passage'.
         instruction_template: Instruction template. Should contain the string '{instruction}'.
+        device: Device used to load the model.
         **kwargs: Additional arguments to pass to the model.
     """
     requires_package(
@@ -40,7 +42,10 @@
             self,
             model_name_or_path: str,
             mode: str,
-
+            device: str | None = None,
+            instruction_template: str
+            | Callable[[str, PromptType | None], str]
+            | None = None,
             **kwargs,
         ):
             if (
@@ -61,7 +66,12 @@
             )

             self.instruction_template = instruction_template
-            super().__init__(
+            super().__init__(
+                model_name_or_path=model_name_or_path,
+                mode=mode,
+                device=device,
+                **kwargs,
+            )

         def encode(
             self,
@@ -82,15 +92,20 @@
             logger.info(
                 f"Using instruction: '{instruction}' for task: '{task_metadata.name}'"
             )
-            embeddings = super().encode(
-                _inputs,
+            embeddings = super().encode(  # type: ignore[safe-super,call-arg]
+                _inputs,  # type: ignore[arg-type]
+                instruction=instruction,
+                *args,
+                **kwargs,
             )
             if isinstance(embeddings, torch.Tensor):
                 # sometimes in kwargs can be return_tensors=True
                 embeddings = embeddings.cpu().detach().float().numpy()
             return embeddings

-    return InstructGritLMModel(
+    return InstructGritLMModel(
+        model_name_or_path, mode, instruction_template=instruction_template, **kwargs
+    )


 class InstructSentenceTransformerModel(AbsEncoder):
@@ -100,6 +115,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
         self,
         model_name: str,
         revision: str,
+        device: str | None = None,
         instruction_template: str
         | Callable[[str, PromptType | None], str]
         | None = None,
@@ -117,12 +133,14 @@
         Arguments:
             model_name: Model name of the sentence transformers model.
             revision: Revision of the sentence transformers model.
+            device: Device used to load the model.
             instruction_template: Model template. Should contain the string '{instruction}'.
             max_seq_length: Maximum sequence length. If None, the maximum sequence length will be read from the model config.
             apply_instruction_to_passages: Whether to apply the instruction template to the passages.
             padding_side: Padding side. If None, the padding side will be read from the model config.
             add_eos_token: Whether to add the eos token to each input example.
-            prompts_dict: Dictionary of task names to prompt names. If
+            prompts_dict: Dictionary of task names to prompt names. If task name is missing in the dict or prompts dict is None, prompt from task metadata or
+                AbsTask.abstask_prompt will be used.
             **kwargs: Kwargs for Sentence Transformer model.
         """
         from sentence_transformers import SentenceTransformer
@@ -140,7 +158,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
         )

         self.instruction_template = instruction_template
-        tokenizer_params = {}
+        tokenizer_params: dict[str, Any] = {}
         if add_eos_token:
             tokenizer_params["add_eos_token"] = add_eos_token
         if max_seq_length is not None:
@@ -152,7 +170,12 @@
         kwargs.setdefault("tokenizer_kwargs", {}).update(tokenizer_params)

         self.model_name = model_name
-        self.model = SentenceTransformer(
+        self.model = SentenceTransformer(
+            model_name, revision=revision, device=device, **kwargs
+        )
+        if max_seq_length:
+            # https://github.com/huggingface/sentence-transformers/issues/3575
+            self.model.max_seq_length = max_seq_length
         self.apply_instruction_to_passages = apply_instruction_to_passages
         self.prompts_dict = prompts_dict

@@ -189,6 +212,7 @@ class InstructSentenceTransformerModel(AbsEncoder):
             The encoded input in a numpy array or torch tensor of the shape (Number of sentences) x (Embedding dimension).
         """
         sentences = [text for batch in inputs for text in batch["text"]]
+        instruction: str | None
         instruction = self.get_task_instruction(task_metadata, prompt_type)

         # to passage prompts won't be applied to passages
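
Both wrappers in instruct_wrapper.py now accept a device argument, and the instruction template callable takes the prompt type as a second argument (Callable[[str, PromptType | None], str] instead of Callable[[str], str]). The sketch below shows a template written against the new signature; the prefix format, model name, and revision are illustrative only, and instruct_wrapper itself still requires the optional GritLM dependency guarded by requires_package.

from mteb.models.instruct_wrapper import InstructSentenceTransformerModel


def example_template(instruction: str, prompt_type=None) -> str:
    # The second argument is the PromptType (query/document) or None, so a
    # template can format queries and passages differently if it needs to.
    return f"Instruct: {instruction}\nQuery: " if instruction else ""


encoder = InstructSentenceTransformerModel(
    model_name="intfloat/multilingual-e5-small",  # illustrative model choice
    revision="main",
    device="cpu",
    instruction_template=example_template,
)
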
mteb/models/model_implementations/align_models.py
CHANGED

@@ -105,6 +105,7 @@ class ALIGNModel(AbsEncoder):
 align_base = ModelMeta(
     loader=ALIGNModel,
     name="kakaobrain/align-base",
+    model_type=["dense"],
     languages=["eng-Latn"],
     revision="e96a37facc7b1f59090ece82293226b817afd6ba",
     release_date="2023-02-24",
@@ -117,11 +118,17 @@ align_base = ModelMeta(
     open_weights=True,
     public_training_code="https://github.com/kakaobrain/coyo-align",
     public_training_data=True,
-    framework=["PyTorch"],
+    framework=["PyTorch", "Transformers"],
     reference="https://huggingface.co/kakaobrain/align-base",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=False,
     training_datasets=set(
         # COYO-700M
     ),
+    citation="""@misc{kakaobrain2022coyo-align,
+        title = {COYO-ALIGN},
+        author = {Yoon, Boogeo and Lee, Youhan and Baek, Woonhyuk},
+        year = {2022},
+        howpublished = {https://github.com/kakaobrain/coyo-align},
+    }""",
 )

mteb/models/model_implementations/andersborges.py
ADDED

@@ -0,0 +1,65 @@
+import numpy as np
+
+from mteb.models.model_implementations.model2vec_models import Model2VecModel
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+
+model2vecdk = ModelMeta(
+    loader=Model2VecModel,
+    name="andersborges/model2vecdk",
+    model_type=["dense"],
+    languages=["dan-Latn"],
+    open_weights=True,
+    revision="cb576c78dcc1b729e4612645f61db59929d69e61",
+    release_date="2025-11-21",
+    n_parameters=48042496,
+    memory_usage_mb=183,
+    max_tokens=np.inf,
+    embed_dim=256,
+    license="mit",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["NumPy", "Sentence Transformers", "safetensors"],
+    reference="https://huggingface.co/andersborges/model2vecdk",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
+    superseded_by=None,
+    training_datasets=set(),  # distilled
+    public_training_code="https://github.com/andersborges/dkmodel2vec",
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+    citation="""@article{minishlab2024model2vec,
+        author = {Tulkens, Stephan and {van Dongen}, Thomas},
+        title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
+        year = {2024},
+        url = {https://github.com/MinishLab/model2vec}
+    }""",
+)
+
+
+model2vecdk_stem = ModelMeta(
+    loader=Model2VecModel,
+    name="andersborges/model2vecdk-stem",
+    model_type=["dense"],
+    languages=["dan-Latn"],
+    open_weights=True,
+    revision="cb576c78dcc1b729e4612645f61db59929d69e61",
+    release_date="2025-11-21",
+    n_parameters=48578560,
+    memory_usage_mb=185,
+    max_tokens=np.inf,
+    embed_dim=256,
+    license="mit",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["NumPy", "Sentence Transformers", "safetensors"],
+    reference="https://huggingface.co/andersborges/model2vecdk",
+    use_instructions=False,
+    adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
+    superseded_by=None,
+    training_datasets=set(),  # distilled
+    public_training_code="https://github.com/andersborges/dkmodel2vec",
+    public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
+    citation="""@article{minishlab2024model2vec,
+        author = {Tulkens, Stephan and {van Dongen}, Thomas},
+        title = {Model2Vec: Fast State-of-the-Art Static Embeddings},
+        year = {2024},
+        url = {https://github.com/MinishLab/model2vec}
+    }""",
+)

mteb/models/model_implementations/ara_models.py
CHANGED

@@ -4,6 +4,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 arabic_triplet_matryoshka = ModelMeta(
     loader=sentence_transformers_loader,
     name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
+    model_type=["dense"],
     languages=["ara-Arab"],
     open_weights=True,
     revision="ed357f222f0b6ea6670d2c9b5a1cb93950d34200",
@@ -15,7 +16,7 @@ arabic_triplet_matryoshka = ModelMeta(
     max_tokens=768,
     reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
     similarity_fn_name=ScoringFunction.COSINE,
-    framework=["Sentence Transformers", "PyTorch"],
+    framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
     use_instructions=False,
     public_training_code=None,
     adapted_from="aubmindlab/bert-base-arabertv02",
@@ -23,4 +24,11 @@
     training_datasets=set(
         # "akhooli/arabic-triplets-1m-curated-sims-len"
     ),
+    citation="""
+@article{nacar2025gate,
+  title={GATE: General Arabic Text Embedding for Enhanced Semantic Textual Similarity with Matryoshka Representation Learning and Hybrid Loss Training},
+  author={Nacar, Omer and Koubaa, Anis and Sibaee, Serry and Al-Habashi, Yasser and Ammar, Adel and Boulila, Wadii},
+  journal={arXiv preprint arXiv:2505.24581},
+  year={2025}
+}""",
 )