mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/models/model_meta.py
CHANGED
|
@@ -1,25 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
1
4
|
import logging
|
|
5
|
+
import warnings
|
|
2
6
|
from collections.abc import Callable, Sequence
|
|
3
7
|
from dataclasses import field
|
|
4
8
|
from enum import Enum
|
|
9
|
+
from functools import partial
|
|
10
|
+
from pathlib import Path
|
|
5
11
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
12
|
|
|
7
|
-
from huggingface_hub import
|
|
13
|
+
from huggingface_hub import (
|
|
14
|
+
GitCommitInfo,
|
|
15
|
+
ModelCard,
|
|
16
|
+
ModelCardData,
|
|
17
|
+
get_safetensors_metadata,
|
|
18
|
+
hf_hub_download,
|
|
19
|
+
list_repo_commits,
|
|
20
|
+
model_info,
|
|
21
|
+
repo_exists,
|
|
22
|
+
)
|
|
8
23
|
from huggingface_hub.errors import (
|
|
24
|
+
EntryNotFoundError,
|
|
9
25
|
GatedRepoError,
|
|
26
|
+
HFValidationError,
|
|
10
27
|
NotASafetensorsRepoError,
|
|
28
|
+
RepositoryNotFoundError,
|
|
11
29
|
SafetensorsParsingError,
|
|
12
30
|
)
|
|
13
|
-
from pydantic import BaseModel, ConfigDict, field_validator
|
|
31
|
+
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
|
|
32
|
+
from transformers import AutoConfig
|
|
33
|
+
from typing_extensions import Self
|
|
14
34
|
|
|
35
|
+
from mteb._helpful_enum import HelpfulStrEnum
|
|
15
36
|
from mteb.languages import check_language_code
|
|
37
|
+
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
|
|
16
38
|
from mteb.types import ISOLanguageScript, Licenses, Modalities, StrDate, StrURL
|
|
17
39
|
|
|
18
|
-
from .models_protocols import EncoderProtocol, MTEBModels
|
|
19
|
-
|
|
20
40
|
if TYPE_CHECKING:
|
|
41
|
+
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
42
|
+
|
|
21
43
|
from mteb.abstasks import AbsTask
|
|
22
44
|
|
|
45
|
+
|
|
23
46
|
logger = logging.getLogger(__name__)
|
|
24
47
|
|
|
25
48
|
FRAMEWORKS = Literal[
|
|
@@ -34,10 +57,16 @@ FRAMEWORKS = Literal[
|
|
|
34
57
|
"PyLate",
|
|
35
58
|
"ColBERT",
|
|
36
59
|
"ColPali",
|
|
60
|
+
"GGUF",
|
|
61
|
+
"safetensors",
|
|
62
|
+
"ONNX",
|
|
63
|
+
"Transformers",
|
|
37
64
|
]
|
|
38
65
|
|
|
66
|
+
MODEL_TYPES = Literal["dense", "cross-encoder", "late-interaction"]
|
|
67
|
+
|
|
39
68
|
|
|
40
|
-
class ScoringFunction(
|
|
69
|
+
class ScoringFunction(HelpfulStrEnum):
|
|
41
70
|
"""The scoring function used by the models."""
|
|
42
71
|
|
|
43
72
|
COSINE = "cosine"
|
|
@@ -72,7 +101,7 @@ class ModelMeta(BaseModel):
|
|
|
72
101
|
models).
|
|
73
102
|
embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings.
|
|
74
103
|
revision: The revision number of the model. If None, it is assumed that the metadata (including the loader) is valid for all revisions of the model.
|
|
75
|
-
release_date: The date the model's revision was released.
|
|
104
|
+
release_date: The date the model's revision was released. If None, then release date will be added based on 1st commit in hf repository of model.
|
|
76
105
|
license: The license under which the model is released. Required if open_weights is True.
|
|
77
106
|
open_weights: Whether the model is open source or proprietary.
|
|
78
107
|
public_training_code: A link to the publicly available training code. If None, it is assumed that the training code is not publicly available.
|
|
@@ -90,7 +119,7 @@ class ModelMeta(BaseModel):
|
|
|
90
119
|
a benchmark as well as mark dataset contaminations.
|
|
91
120
|
adapted_from: Name of the model from which this model is adapted. For quantizations, fine-tunes, long doc extensions, etc.
|
|
92
121
|
superseded_by: Name of the model that supersedes this model, e.g., nvidia/NV-Embed-v2 supersedes v1.
|
|
93
|
-
|
|
122
|
+
model_type: A list of strings representing the type of model.
|
|
94
123
|
modalities: A list of strings representing the modalities the model supports. Default is ["text"].
|
|
95
124
|
contacts: The people to contact in case of a problem in the model, preferably a GitHub handle.
|
|
96
125
|
"""
|
|
@@ -120,10 +149,49 @@ class ModelMeta(BaseModel):
|
|
|
120
149
|
adapted_from: str | None = None
|
|
121
150
|
superseded_by: str | None = None
|
|
122
151
|
modalities: list[Modalities] = ["text"]
|
|
123
|
-
|
|
152
|
+
model_type: list[MODEL_TYPES] = ["dense"]
|
|
124
153
|
citation: str | None = None
|
|
125
154
|
contacts: list[str] | None = None
|
|
126
155
|
|
|
156
|
+
@model_validator(mode="before")
@classmethod
def handle_legacy_is_cross_encoder(cls, data: Any) -> Any:
    """Translate the deprecated ``is_cross_encoder`` flag into ``model_type``.

    Runs before field validation. Input that is not a dict, or that has no
    ``is_cross_encoder`` key, is returned untouched. Otherwise the key is
    removed and:

    * ``None`` is ignored (no warning, ``model_type`` left as provided);
    * a truthy value replaces ``model_type`` with ``["cross-encoder"]``
      unless ``"cross-encoder"`` is already listed;
    * a falsy value strips ``"cross-encoder"`` from ``model_type``, falling
      back to ``["dense"]`` when nothing else remains.

    Args:
        data: Raw input handed to the model constructor / validator.

    Returns:
        The input itself when no conversion is needed, otherwise a shallow
        copy with ``is_cross_encoder`` removed and ``model_type`` adjusted.
    """
    if not isinstance(data, dict) or "is_cross_encoder" not in data:
        return data

    # Work on a shallow copy: popping from the mapping we were given would
    # silently strip the key from a dict the caller may still be using.
    data = dict(data)
    legacy_flag = data.pop("is_cross_encoder")
    if legacy_flag is None:
        return data

    warnings.warn(
        "is_cross_encoder is deprecated and will be removed in a future version. "
        "Use model_type=['cross-encoder'] instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    model_type = data.get("model_type", ["dense"])
    if legacy_flag:
        if "cross-encoder" not in model_type:
            data["model_type"] = ["cross-encoder"]
    elif "cross-encoder" in model_type:
        remaining = [t for t in model_type if t != "cross-encoder"]
        data["model_type"] = remaining if remaining else ["dense"]

    return data
|
|
186
|
+
|
|
187
|
+
@property
def is_cross_encoder(self) -> bool:
    """Whether this model is tagged as a cross-encoder.

    The flag is not stored; it is computed from ``model_type`` and is True
    exactly when ``"cross-encoder"`` appears in that list.
    """
    declared_types = self.model_type
    return "cross-encoder" in declared_types
|
|
194
|
+
|
|
127
195
|
@field_validator("similarity_fn_name", mode="before")
|
|
128
196
|
@classmethod
|
|
129
197
|
def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
|
|
@@ -159,6 +227,7 @@ class ModelMeta(BaseModel):
|
|
|
159
227
|
else dict_repr["training_datasets"]
|
|
160
228
|
)
|
|
161
229
|
dict_repr["loader"] = _get_loader_name(loader)
|
|
230
|
+
dict_repr["is_cross_encoder"] = self.is_cross_encoder
|
|
162
231
|
return dict_repr
|
|
163
232
|
|
|
164
233
|
@field_validator("languages")
|
|
@@ -184,7 +253,7 @@ class ModelMeta(BaseModel):
|
|
|
184
253
|
)
|
|
185
254
|
return v
|
|
186
255
|
|
|
187
|
-
def load_model(self, **kwargs: Any) -> MTEBModels:
|
|
256
|
+
def load_model(self, device: str | None = None, **kwargs: Any) -> MTEBModels:
|
|
188
257
|
"""Loads the model using the specified loader function."""
|
|
189
258
|
if self.loader is None:
|
|
190
259
|
raise NotImplementedError(
|
|
@@ -196,11 +265,11 @@ class ModelMeta(BaseModel):
|
|
|
196
265
|
# Allow overwrites
|
|
197
266
|
_kwargs = self.loader_kwargs.copy()
|
|
198
267
|
_kwargs.update(kwargs)
|
|
268
|
+
if device is not None:
|
|
269
|
+
_kwargs["device"] = device
|
|
199
270
|
|
|
200
|
-
model:
|
|
201
|
-
|
|
202
|
-
)
|
|
203
|
-
model.mteb_model_meta = self # type: ignore
|
|
271
|
+
model: MTEBModels = self.loader(self.name, revision=self.revision, **_kwargs)
|
|
272
|
+
model.mteb_model_meta = self # type: ignore[misc]
|
|
204
273
|
return model
|
|
205
274
|
|
|
206
275
|
def model_name_as_path(self) -> str:
|
|
@@ -212,9 +281,188 @@ class ModelMeta(BaseModel):
|
|
|
212
281
|
raise ValueError("Model name is not set")
|
|
213
282
|
return self.name.replace("/", "__").replace(" ", "_")
|
|
214
283
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
284
|
+
@classmethod
def _from_hub(
    cls,
    model_name: str | None,
    revision: str | None = None,
    compute_metadata: bool = True,
) -> Self:
    """Build a ModelMeta for a HuggingFace model name.

    When ``model_name`` is set, ``compute_metadata`` is True and the repo
    exists on the hub, the model card, config, tags, commit history and
    safetensors metadata are queried to fill in reference, license,
    frameworks, revision, release date, parameter count, memory usage,
    embedding dimension and max tokens. Otherwise those fields keep their
    placeholder values (``None``, or ``["PyTorch"]`` for frameworks).

    Args:
        model_name: The HuggingFace model name.
        revision: Revision of the model. If None and metadata is computed,
            the commit id of the first entry of the hub commit listing is used.
        compute_metadata: Add metadata based on model card

    Returns:
        The generated ModelMeta.
    """
    from mteb.models import sentence_transformers_loader

    frameworks: list[FRAMEWORKS] = ["PyTorch"]
    reference = model_license = release_date = None
    n_parameters = memory_usage_mb = None
    embedding_dim = max_tokens = None

    if model_name and compute_metadata and _repo_exists(model_name):
        reference = "https://huggingface.co/" + model_name
        card_data: ModelCardData = ModelCard.load(model_name).data
        try:
            model_config = AutoConfig.from_pretrained(model_name)
        except Exception as e:
            # some models can't load AutoConfig (e.g. `average_word_embeddings_levy_dependency`)
            model_config = None
            logger.warning(f"Can't get configuration for {model_name}. Error: {e}")

        frameworks.extend(cls._get_frameworks_from_hf_tags(model_name))

        if revision is None:
            commits = _get_repo_commits(model_name, "model")
            revision = commits[0].commit_id if commits else None

        release_date = cls.fetch_release_date(model_name)
        # A license of "other" carries no usable information, so store None.
        model_license = None if card_data.license == "other" else card_data.license
        n_parameters = cls._calculate_num_parameters_from_hub(model_name)
        memory_usage_mb = cls._calculate_memory_usage_mb(model_name, n_parameters)
        if model_config:
            embedding_dim = getattr(model_config, "hidden_size", None)
            max_tokens = getattr(model_config, "max_position_embeddings", None)

    return cls(
        loader=sentence_transformers_loader,
        name=model_name or "no_model_name/available",
        revision=revision or "no_revision_available",
        reference=reference,
        release_date=release_date,
        languages=None,
        license=model_license,
        framework=frameworks,
        training_datasets=None,
        similarity_fn_name=None,
        n_parameters=n_parameters,
        memory_usage_mb=memory_usage_mb,
        max_tokens=max_tokens,
        embed_dim=embedding_dim,
        open_weights=True,
        public_training_code=None,
        public_training_data=None,
        use_instructions=None,
        modalities=[],
    )
|
|
363
|
+
|
|
364
|
+
@classmethod
def from_sentence_transformer_model(
    cls,
    model: SentenceTransformer,
    revision: str | None = None,
    compute_metadata: bool = True,
) -> Self:
    """Build a ModelMeta from a loaded SentenceTransformer.

    The hub name is taken from the model card data (``model_name``, falling
    back to ``base_model``). Hub-derived values are then overridden with what
    the loaded model reports: sequence length, embedding dimension and
    similarity function.

    Args:
        model: SentenceTransformer model.
        revision: Revision of the model
        compute_metadata: Add metadata based on model card

    Returns:
        The generated ModelMeta.
    """
    card = model.model_card_data
    name: str | None = card.model_name or card.base_model

    meta = cls._from_hub(name, revision, compute_metadata)
    # Prefer the revision recorded on the model card over the hub-derived one.
    meta.revision = card.base_model_revision or meta.revision
    meta.max_tokens = model.max_seq_length
    meta.embed_dim = model.get_sentence_embedding_dimension()
    meta.similarity_fn_name = ScoringFunction.from_str(model.similarity_fn_name)
    meta.modalities = ["text"]
    return meta
|
|
393
|
+
|
|
394
|
+
@classmethod
def from_hub(
    cls,
    model: str,
    revision: str | None = None,
    compute_metadata: bool = True,
) -> Self:
    """Build a ModelMeta for a model hosted on the HuggingFace hub.

    On top of the generic hub metadata, the sentence-transformers config
    files are consulted when the repo exists and ``compute_metadata`` is True:
    ``sentence_bert_config.json`` for ``max_seq_length`` and
    ``config_sentence_transformers.json`` for ``similarity_fn_name``
    (cosine is used when that file or key is absent).

    Args:
        model: Name of the model from HuggingFace hub. For example, `intfloat/multilingual-e5-large`
        revision: Revision of the model
        compute_metadata: Add metadata based on model card

    Returns:
        The generated ModelMeta.
    """
    meta = cls._from_hub(model, revision, compute_metadata)
    meta.modalities = ["text"]

    if not (model and compute_metadata and _repo_exists(model)):
        return meta

    # have max_seq_length field
    sbert_config = _get_json_from_hub(
        model, "sentence_bert_config.json", "model", revision=revision
    )
    if sbert_config:
        meta.max_tokens = sbert_config.get("max_seq_length", None) or meta.max_tokens

    # have model type, similarity function fields
    config_sbert = _get_json_from_hub(
        model, "config_sentence_transformers.json", "model", revision=revision
    )
    similarity_name = (
        None if config_sbert is None else config_sbert.get("similarity_fn_name")
    )
    if similarity_name is None:
        meta.similarity_fn_name = ScoringFunction.COSINE
    else:
        meta.similarity_fn_name = ScoringFunction.from_str(similarity_name)
    return meta
|
|
437
|
+
|
|
438
|
+
@classmethod
def from_cross_encoder(
    cls,
    model: CrossEncoder,
    revision: str | None = None,
    compute_metadata: bool = True,
) -> Self:
    """Build a ModelMeta from a loaded CrossEncoder.

    Hub metadata is looked up under ``model.model.name_or_path``; the result
    is then marked as a text cross-encoder loaded through
    ``CrossEncoderWrapper``, with no embedding dimension.

    Args:
        model: The CrossEncoder model
        revision: Revision of the model
        compute_metadata: Add metadata based on model card

    Returns:
        The generated ModelMeta
    """
    from mteb.models import CrossEncoderWrapper

    hub_name = model.model.name_or_path
    meta = cls._from_hub(hub_name, revision, compute_metadata)

    # The commit hash stored on the loaded config wins over the hub lookup.
    loaded_commit = model.config._commit_hash
    meta.revision = loaded_commit or meta.revision
    meta.loader = CrossEncoderWrapper
    meta.embed_dim = None
    meta.modalities = ["text"]
    meta.model_type = ["cross-encoder"]
    return meta
|
|
464
|
+
|
|
465
|
+
def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | None:
|
|
218
466
|
"""Indicates whether the given model can be considered zero-shot or not on the given tasks.
|
|
219
467
|
|
|
220
468
|
Returns:
|
|
@@ -255,10 +503,12 @@ class ModelMeta(BaseModel):
|
|
|
255
503
|
if adapted_training_datasets is not None:
|
|
256
504
|
training_datasets |= adapted_training_datasets
|
|
257
505
|
except (ValueError, KeyError) as e:
|
|
258
|
-
|
|
506
|
+
msg = f"Could not get source model: {e} in MTEB"
|
|
507
|
+
logger.warning(msg)
|
|
508
|
+
warnings.warn(msg)
|
|
259
509
|
|
|
260
510
|
return_dataset = training_datasets.copy()
|
|
261
|
-
visited = set()
|
|
511
|
+
visited: set[str] = set()
|
|
262
512
|
|
|
263
513
|
for dataset in training_datasets:
|
|
264
514
|
similar_tasks = _collect_similar_tasks(dataset, visited)
|
|
@@ -267,7 +517,7 @@ class ModelMeta(BaseModel):
|
|
|
267
517
|
return return_dataset
|
|
268
518
|
|
|
269
519
|
def zero_shot_percentage(
|
|
270
|
-
self, tasks: Sequence[
|
|
520
|
+
self, tasks: Sequence[AbsTask] | Sequence[str]
|
|
271
521
|
) -> int | None:
|
|
272
522
|
"""Indicates how out-of-domain the selected tasks are for the given model.
|
|
273
523
|
|
|
@@ -290,18 +540,40 @@ class ModelMeta(BaseModel):
|
|
|
290
540
|
perc_overlap = 100 * (len(overlap) / len(benchmark_datasets))
|
|
291
541
|
return int(100 - perc_overlap)
|
|
292
542
|
|
|
293
|
-
|
|
294
|
-
|
|
543
|
+
@staticmethod
|
|
544
|
+
def _calculate_num_parameters_from_hub(model_name: str | None = None) -> int | None:
|
|
545
|
+
if not model_name:
|
|
546
|
+
return None
|
|
547
|
+
try:
|
|
548
|
+
safetensors_metadata = get_safetensors_metadata(model_name)
|
|
549
|
+
if len(safetensors_metadata.parameter_count) >= 0:
|
|
550
|
+
return sum(safetensors_metadata.parameter_count.values())
|
|
551
|
+
except (
|
|
552
|
+
NotASafetensorsRepoError,
|
|
553
|
+
SafetensorsParsingError,
|
|
554
|
+
GatedRepoError,
|
|
555
|
+
RepositoryNotFoundError,
|
|
556
|
+
) as e:
|
|
557
|
+
logger.warning(
|
|
558
|
+
f"Can't calculate number of parameters for {model_name}. Got error {e}"
|
|
559
|
+
)
|
|
560
|
+
return None
|
|
561
|
+
|
|
562
|
+
def calculate_num_parameters_from_hub(self) -> int | None:
|
|
563
|
+
"""Calculates the number of parameters in the model.
|
|
295
564
|
|
|
296
565
|
Returns:
|
|
297
|
-
|
|
566
|
+
Number of parameters in the model.
|
|
298
567
|
"""
|
|
299
|
-
|
|
300
|
-
return None
|
|
568
|
+
return self._calculate_num_parameters_from_hub(self.name)
|
|
301
569
|
|
|
570
|
+
@staticmethod
|
|
571
|
+
def _calculate_memory_usage_mb(
|
|
572
|
+
model_name: str, n_parameters: int | None
|
|
573
|
+
) -> int | None:
|
|
302
574
|
MB = 1024**2 # noqa: N806
|
|
303
575
|
try:
|
|
304
|
-
safetensors_metadata = get_safetensors_metadata(
|
|
576
|
+
safetensors_metadata = get_safetensors_metadata(model_name)
|
|
305
577
|
if len(safetensors_metadata.parameter_count) >= 0:
|
|
306
578
|
dtype_size_map = {
|
|
307
579
|
"F64": 8, # 64-bit float
|
|
@@ -320,18 +592,167 @@ class ModelMeta(BaseModel):
|
|
|
320
592
|
for dtype, parameters in safetensors_metadata.parameter_count.items()
|
|
321
593
|
)
|
|
322
594
|
return round(total_memory_bytes / MB) # Convert to MB
|
|
595
|
+
except (
|
|
596
|
+
NotASafetensorsRepoError,
|
|
597
|
+
SafetensorsParsingError,
|
|
598
|
+
GatedRepoError,
|
|
599
|
+
RepositoryNotFoundError,
|
|
600
|
+
) as e:
|
|
601
|
+
logger.warning(
|
|
602
|
+
f"Can't calculate memory usage for {model_name}. Got error {e}"
|
|
603
|
+
)
|
|
323
604
|
|
|
324
|
-
|
|
325
|
-
pass
|
|
326
|
-
if self.n_parameters is None:
|
|
605
|
+
if n_parameters is None:
|
|
327
606
|
return None
|
|
328
607
|
# Model memory in bytes. For FP32 each parameter is 4 bytes.
|
|
329
|
-
model_memory_bytes =
|
|
608
|
+
model_memory_bytes = n_parameters * 4
|
|
330
609
|
|
|
331
610
|
# Convert to MB
|
|
332
611
|
model_memory_mb = model_memory_bytes / MB
|
|
333
612
|
return round(model_memory_mb)
|
|
334
613
|
|
|
614
|
+
def calculate_memory_usage_mb(self) -> int | None:
|
|
615
|
+
"""Calculates the memory usage of the model in MB.
|
|
616
|
+
|
|
617
|
+
Returns:
|
|
618
|
+
The memory usage of the model in MB, or None if it cannot be determined.
|
|
619
|
+
"""
|
|
620
|
+
if "API" in self.framework or self.name is None:
|
|
621
|
+
return None
|
|
622
|
+
|
|
623
|
+
return self._calculate_memory_usage_mb(self.name, self.n_parameters)
|
|
624
|
+
|
|
625
|
+
@staticmethod
def fetch_release_date(model_name: str) -> StrDate | None:
    """Derive a release date from the model repo's commit history.

    The last entry of the commit listing is treated as the initial commit
    and its ``created_at`` timestamp is formatted as the release date.

    Returns:
        The release date in YYYY-MM-DD format, or None if no commits could
        be retrieved.
    """
    history = _get_repo_commits(repo_id=model_name, repo_type="model")
    if not history:
        return None
    first_commit = history[-1]
    return first_commit.created_at.strftime("%Y-%m-%d")
|
|
638
|
+
|
|
639
|
+
@staticmethod
def _get_frameworks_from_hf_tags(model_name: str) -> list[FRAMEWORKS]:
    """Map the tags of a HuggingFace model repo onto MTEB framework names.

    Args:
        model_name: HuggingFace model name

    Returns:
        Framework names for every recognised tag, in the order of the
        internal tag table. An empty list is returned when the repo has no
        tags or the hub lookup fails (the failure is logged as a warning).
    """
    # Mapping from HuggingFace tags to MTEB framework names
    tag_to_framework: dict[str, FRAMEWORKS] = {
        "sentence-transformers": "Sentence Transformers",
        "transformers": "Transformers",
        "onnx": "ONNX",
        "safetensors": "safetensors",
        "gguf": "GGUF",
    }

    try:
        hub_tags = model_info(model_name).tags
        if not hub_tags:
            return []
    except Exception as e:
        # Best-effort lookup: any hub failure degrades to "no frameworks found".
        logger.warning(
            f"Failed to fetch frameworks from HuggingFace tags for {model_name}: {e}"
        )
        return []

    return [
        framework for tag, framework in tag_to_framework.items() if tag in hub_tags
    ]
|
|
675
|
+
|
|
676
|
+
def to_python(self) -> str:
    """Render this model's metadata as Python constructor source code."""
    source = _pydantic_instance_to_code(self)
    return source
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def _pydantic_instance_to_code(
    model: BaseModel,
    indent: int = 4,
    *,
    only_set_fields: bool = False,
) -> str:
    """Render a Pydantic model instance as Python constructor source code.

    The output is ``ClassName(`` on the first line, one ``field=value,``
    line per field (each prefixed by ``indent`` spaces), and a closing ``)``.

    Args:
        model: The Pydantic model to convert.
        indent: Number of spaces placed before each field line.
        only_set_fields: If True, only fields explicitly provided at model
            construction time are emitted (fields that came only from
            defaults are left out).

    Returns:
        The constructor call as a newline-joined string.
    """
    names = list(type(model).model_fields.keys())
    if only_set_fields:
        explicitly_set = model.model_fields_set
        names = [name for name in names if name in explicitly_set]

    prefix = " " * indent
    body = [
        f"{prefix}{name}={_value_to_code(getattr(model, name), indent)},"
        for name in names
    ]
    return "\n".join([f"{model.__class__.__name__}(", *body, ")"])
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
def _value_to_code(value: Any, indent: int) -> str:
    """Render a single Python value as source code.

    Handled specially, in this order: nested Pydantic models (rendered with
    only their explicitly-set fields), callables (emitted by ``__name__``;
    for a ``partial`` the wrapped function's name is used and the bound
    arguments are not emitted), Enum members (``Class.MEMBER``), strings,
    lists, sets (elements sorted) and dicts. Anything else falls back to
    ``repr``.

    Args:
        value: The value to render.
        indent: Indentation width passed through to nested model rendering.

    Returns:
        The source-code representation of ``value``.
    """
    if isinstance(value, BaseModel):
        return _pydantic_instance_to_code(value, indent, only_set_fields=True)

    if callable(value):
        target = value.func if isinstance(value, partial) else value
        return target.__name__

    if isinstance(value, Enum):
        return f"{value.__class__.__name__}.{value.name}"

    if isinstance(value, str):
        return repr(value)

    def render(item: Any) -> str:
        return _value_to_code(item, indent)

    if isinstance(value, list):
        return "[" + ", ".join(map(render, value)) + "]" if value else "[]"

    if isinstance(value, set):
        # An empty set has no literal form, hence the explicit `set()`.
        if not value:
            return "set()"
        return "{" + ", ".join(map(render, sorted(value))) + "}"

    if isinstance(value, dict):
        if not value:
            return "{}"
        pairs = [f"{render(key)}: {render(item)}" for key, item in value.items()]
        return "{" + ", ".join(pairs) + "}"

    return repr(value)
|
|
755
|
+
|
|
335
756
|
|
|
336
757
|
def _collect_similar_tasks(dataset: str, visited: set[str]) -> set[str]:
|
|
337
758
|
"""Recursively collect all similar tasks for a given dataset.
|
|
@@ -364,3 +785,51 @@ def _collect_similar_tasks(dataset: str, visited: set[str]) -> set[str]:
|
|
|
364
785
|
similar.update(_collect_similar_tasks(parent, visited))
|
|
365
786
|
|
|
366
787
|
return similar
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def _get_repo_commits(repo_id: str, repo_type: str) -> list[GitCommitInfo] | None:
    """List the commits of a hub repository.

    Args:
        repo_id: The repository ID.
        repo_type: The type of repository, forwarded to the hub client.

    Returns:
        The commit listing, or None when the repo is gated or not found
        (the error is logged as a warning).
    """
    try:
        commits = list_repo_commits(repo_id=repo_id, repo_type=repo_type)
    except (GatedRepoError, RepositoryNotFoundError) as e:
        logger.warning(f"Can't get commits of {repo_id}: {e}")
        return None
    return commits
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def _get_json_from_hub(
    repo_id: str, file_name: str, repo_type: str, revision: str | None = None
) -> dict[str, Any] | None:
    """Download a JSON file from a hub repository and parse it.

    Args:
        repo_id: The repository ID.
        file_name: Path of the JSON file inside the repository.
        repo_type: The type of repository, forwarded to the download helper.
        revision: Optional revision to download from.

    Returns:
        The parsed JSON content, or None when the file could not be fetched.
    """
    path = _get_file_on_hub(repo_id, file_name, repo_type, revision)
    if path is None:
        return None

    # JSON is UTF-8 by specification; relying on the locale default encoding
    # can mis-decode or fail on non-ASCII content on some platforms.
    with Path(path).open(encoding="utf-8") as f:
        return json.load(f)
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
def _get_file_on_hub(
    repo_id: str, file_name: str, repo_type: str, revision: str | None = None
) -> str | None:
    """Download a single file from a hub repository.

    Args:
        repo_id: The repository ID.
        file_name: Path of the file inside the repository.
        repo_type: The type of repository, forwarded to the hub client.
        revision: Optional revision to download from.

    Returns:
        The value returned by ``hf_hub_download`` for the file, or None when
        the repo is gated or missing, or the file does not exist in it (the
        error is logged as a warning).
    """
    try:
        local_path = hf_hub_download(
            repo_id=repo_id, filename=file_name, repo_type=repo_type, revision=revision
        )
    except (GatedRepoError, RepositoryNotFoundError, EntryNotFoundError) as e:
        logger.warning(f"Can't get file {file_name} of {repo_id}: {e}")
        return None
    return local_path
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def _repo_exists(repo_id: str, repo_type: str | None = None) -> bool:
    """Check whether a repository exists on HuggingFace Hub.

    The hub client raises HFValidationError for ids it cannot validate
    (e.g. invalid local paths); that case is logged as a warning and
    reported as "does not exist".

    Args:
        repo_id: The repository ID.
        repo_type: The type of repository (e.g., "model", "dataset", "space").

    Returns:
        True if the hub reports the repository as existing, False otherwise.
    """
    try:
        found = repo_exists(repo_id=repo_id, repo_type=repo_type)
    except HFValidationError as e:
        logger.warning(f"Can't check existence of {repo_id}: {e}")
        return False
    return found
|