mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
|
@@ -6,6 +6,7 @@ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loade
|
|
|
6
6
|
parsbert = ModelMeta(
|
|
7
7
|
loader=sentence_transformers_loader,
|
|
8
8
|
name="HooshvareLab/bert-base-parsbert-uncased",
|
|
9
|
+
model_type=["dense"],
|
|
9
10
|
languages=["fas-Arab"],
|
|
10
11
|
open_weights=True,
|
|
11
12
|
revision="d73a0e2c7492c33bd5819bcdb23eba207404dd19",
|
|
@@ -17,7 +18,7 @@ parsbert = ModelMeta(
|
|
|
17
18
|
max_tokens=512,
|
|
18
19
|
reference="https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased",
|
|
19
20
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
20
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
21
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers"],
|
|
21
22
|
use_instructions=False,
|
|
22
23
|
public_training_code=None,
|
|
23
24
|
public_training_data=None,
|
|
@@ -41,6 +42,7 @@ parsbert = ModelMeta(
|
|
|
41
42
|
bert_zwnj = ModelMeta(
|
|
42
43
|
loader=sentence_transformers_loader,
|
|
43
44
|
name="m3hrdadfi/bert-zwnj-wnli-mean-tokens",
|
|
45
|
+
model_type=["dense"],
|
|
44
46
|
languages=["fas-Arab"],
|
|
45
47
|
open_weights=True,
|
|
46
48
|
revision="b9506ddc579ac8c398ae6dae680401ae0a1a5b23",
|
|
@@ -52,7 +54,7 @@ bert_zwnj = ModelMeta(
|
|
|
52
54
|
max_tokens=512,
|
|
53
55
|
reference="https://huggingface.co/m3hrdadfi/bert-zwnj-wnli-mean-tokens",
|
|
54
56
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
55
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
57
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers"],
|
|
56
58
|
use_instructions=False,
|
|
57
59
|
public_training_code=None,
|
|
58
60
|
public_training_data=None,
|
|
@@ -66,6 +68,7 @@ bert_zwnj = ModelMeta(
|
|
|
66
68
|
roberta_zwnj = ModelMeta(
|
|
67
69
|
loader=sentence_transformers_loader,
|
|
68
70
|
name="m3hrdadfi/roberta-zwnj-wnli-mean-tokens",
|
|
71
|
+
model_type=["dense"],
|
|
69
72
|
languages=["fas-Arab"],
|
|
70
73
|
open_weights=True,
|
|
71
74
|
revision="36f912ac44e22250aee16ea533a4ff8cd848c1a1",
|
|
@@ -77,7 +80,7 @@ roberta_zwnj = ModelMeta(
|
|
|
77
80
|
max_tokens=514,
|
|
78
81
|
reference="https://huggingface.co/m3hrdadfi/roberta-zwnj-wnli-mean-tokens",
|
|
79
82
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
80
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
83
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers"],
|
|
81
84
|
use_instructions=False,
|
|
82
85
|
public_training_code=None,
|
|
83
86
|
public_training_data=None,
|
|
@@ -90,6 +93,7 @@ roberta_zwnj = ModelMeta(
|
|
|
90
93
|
sentence_transformer_parsbert = ModelMeta(
|
|
91
94
|
loader=sentence_transformers_loader,
|
|
92
95
|
name="myrkur/sentence-transformer-parsbert-fa",
|
|
96
|
+
model_type=["dense"],
|
|
93
97
|
languages=["fas-Arab"],
|
|
94
98
|
open_weights=True,
|
|
95
99
|
revision="72bd0a3557622f0ae08a092f4643609e0b950cdd",
|
|
@@ -101,7 +105,7 @@ sentence_transformer_parsbert = ModelMeta(
|
|
|
101
105
|
max_tokens=512,
|
|
102
106
|
reference="https://huggingface.co/myrkur/sentence-transformer-parsbert-fa",
|
|
103
107
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
104
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
108
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
105
109
|
use_instructions=False,
|
|
106
110
|
public_training_code=None,
|
|
107
111
|
public_training_data=None,
|
|
@@ -125,7 +129,7 @@ tooka_bert_base = ModelMeta(
|
|
|
125
129
|
max_tokens=512,
|
|
126
130
|
reference="https://huggingface.co/PartAI/TookaBERT-Base",
|
|
127
131
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
128
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
132
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
|
|
129
133
|
use_instructions=False,
|
|
130
134
|
public_training_code=None,
|
|
131
135
|
public_training_data=None,
|
|
@@ -140,6 +144,7 @@ tooka_bert_base = ModelMeta(
|
|
|
140
144
|
tooka_sbert = ModelMeta(
|
|
141
145
|
loader=sentence_transformers_loader,
|
|
142
146
|
name="PartAI/Tooka-SBERT",
|
|
147
|
+
model_type=["dense"],
|
|
143
148
|
languages=["fas-Arab"],
|
|
144
149
|
open_weights=True,
|
|
145
150
|
revision="5d07f0c543aca654373b931ae07cd197769110fd",
|
|
@@ -151,16 +156,26 @@ tooka_sbert = ModelMeta(
|
|
|
151
156
|
max_tokens=512,
|
|
152
157
|
reference="https://huggingface.co/PartAI/Tooka-SBERT",
|
|
153
158
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
154
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
159
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
155
160
|
use_instructions=False,
|
|
156
161
|
public_training_code=None,
|
|
157
162
|
public_training_data=None,
|
|
158
163
|
training_datasets=None,
|
|
164
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
165
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
166
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
167
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
168
|
+
month = "11",
|
|
169
|
+
year = "2019",
|
|
170
|
+
publisher = "Association for Computational Linguistics",
|
|
171
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
172
|
+
}""",
|
|
159
173
|
)
|
|
160
174
|
|
|
161
175
|
fa_bert = ModelMeta(
|
|
162
176
|
loader=sentence_transformers_loader,
|
|
163
177
|
name="sbunlp/fabert",
|
|
178
|
+
model_type=["dense"],
|
|
164
179
|
languages=["fas-Arab"],
|
|
165
180
|
open_weights=True,
|
|
166
181
|
revision="a0e3973064c97768e121b9b95f21adc94e0ca3fb",
|
|
@@ -172,7 +187,7 @@ fa_bert = ModelMeta(
|
|
|
172
187
|
max_tokens=512,
|
|
173
188
|
reference="https://huggingface.co/sbunlp/fabert",
|
|
174
189
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
175
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
190
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
|
|
176
191
|
use_instructions=False,
|
|
177
192
|
public_training_code=None,
|
|
178
193
|
public_training_data=None,
|
|
@@ -180,11 +195,35 @@ fa_bert = ModelMeta(
|
|
|
180
195
|
# It's just a base model
|
|
181
196
|
# https://huggingface.co/datasets/sbunlp/hmblogs-v3
|
|
182
197
|
),
|
|
198
|
+
citation="""@inproceedings{masumi-etal-2025-fabert,
|
|
199
|
+
title = "{F}a{BERT}: Pre-training {BERT} on {P}ersian Blogs",
|
|
200
|
+
author = "Masumi, Mostafa and
|
|
201
|
+
Majd, Seyed Soroush and
|
|
202
|
+
Shamsfard, Mehrnoush and
|
|
203
|
+
Beigy, Hamid",
|
|
204
|
+
editor = "Bak, JinYeong and
|
|
205
|
+
Goot, Rob van der and
|
|
206
|
+
Jang, Hyeju and
|
|
207
|
+
Buaphet, Weerayut and
|
|
208
|
+
Ramponi, Alan and
|
|
209
|
+
Xu, Wei and
|
|
210
|
+
Ritter, Alan",
|
|
211
|
+
booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
|
|
212
|
+
month = may,
|
|
213
|
+
year = "2025",
|
|
214
|
+
address = "Albuquerque, New Mexico, USA",
|
|
215
|
+
publisher = "Association for Computational Linguistics",
|
|
216
|
+
url = "https://aclanthology.org/2025.wnut-1.10/",
|
|
217
|
+
doi = "10.18653/v1/2025.wnut-1.10",
|
|
218
|
+
pages = "85--96",
|
|
219
|
+
ISBN = "979-8-89176-232-9",
|
|
220
|
+
}""",
|
|
183
221
|
)
|
|
184
222
|
|
|
185
223
|
tooka_sbert_v2_small = ModelMeta(
|
|
186
224
|
loader=sentence_transformers_loader,
|
|
187
225
|
name="PartAI/Tooka-SBERT-V2-Small",
|
|
226
|
+
model_type=["dense"],
|
|
188
227
|
languages=["fas-Arab"],
|
|
189
228
|
open_weights=True,
|
|
190
229
|
revision="8bbed87e36669387f71437c061430ba56d1b496f",
|
|
@@ -196,16 +235,26 @@ tooka_sbert_v2_small = ModelMeta(
|
|
|
196
235
|
max_tokens=512,
|
|
197
236
|
reference="https://huggingface.co/PartAI/Tooka-SBERT-V2-Small",
|
|
198
237
|
similarity_fn_name="cosine",
|
|
199
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
238
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
200
239
|
use_instructions=False,
|
|
201
240
|
public_training_code=None,
|
|
202
241
|
public_training_data=None,
|
|
203
242
|
training_datasets=None,
|
|
243
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
244
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
245
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
246
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
247
|
+
month = "11",
|
|
248
|
+
year = "2019",
|
|
249
|
+
publisher = "Association for Computational Linguistics",
|
|
250
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
251
|
+
}""",
|
|
204
252
|
)
|
|
205
253
|
|
|
206
254
|
tooka_sbert_v2_large = ModelMeta(
|
|
207
255
|
loader=sentence_transformers_loader,
|
|
208
256
|
name="PartAI/Tooka-SBERT-V2-Large",
|
|
257
|
+
model_type=["dense"],
|
|
209
258
|
languages=["fas-Arab"],
|
|
210
259
|
open_weights=True,
|
|
211
260
|
revision="b59682efa961122cc0e4408296d5852870c82eae",
|
|
@@ -217,9 +266,18 @@ tooka_sbert_v2_large = ModelMeta(
|
|
|
217
266
|
max_tokens=512,
|
|
218
267
|
reference="https://huggingface.co/PartAI/Tooka-SBERT-V2-Large",
|
|
219
268
|
similarity_fn_name="cosine",
|
|
220
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
269
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
221
270
|
use_instructions=False,
|
|
222
271
|
public_training_code=None,
|
|
223
272
|
public_training_data=None,
|
|
224
273
|
training_datasets=None,
|
|
274
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
275
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
276
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
277
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
278
|
+
month = "11",
|
|
279
|
+
year = "2019",
|
|
280
|
+
publisher = "Association for Computational Linguistics",
|
|
281
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
282
|
+
}""",
|
|
225
283
|
)
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
from mteb.models import sentence_transformers_loader
|
|
2
|
+
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
3
|
+
|
|
4
|
+
XLMR_LANGUAGES = [
|
|
5
|
+
"afr-Latn",
|
|
6
|
+
"amh-Latn",
|
|
7
|
+
"ara-Latn",
|
|
8
|
+
"asm-Latn",
|
|
9
|
+
"aze-Latn",
|
|
10
|
+
"bel-Latn",
|
|
11
|
+
"bul-Latn",
|
|
12
|
+
"ben-Latn",
|
|
13
|
+
"ben-Beng",
|
|
14
|
+
"bre-Latn",
|
|
15
|
+
"bos-Latn",
|
|
16
|
+
"cat-Latn",
|
|
17
|
+
"ces-Latn",
|
|
18
|
+
"cym-Latn",
|
|
19
|
+
"dan-Latn",
|
|
20
|
+
"deu-Latn",
|
|
21
|
+
"ell-Latn",
|
|
22
|
+
"eng-Latn",
|
|
23
|
+
"epo-Latn",
|
|
24
|
+
"spa-Latn",
|
|
25
|
+
"est-Latn",
|
|
26
|
+
"eus-Latn",
|
|
27
|
+
"fas-Latn",
|
|
28
|
+
"fin-Latn",
|
|
29
|
+
"fra-Latn",
|
|
30
|
+
"fry-Latn",
|
|
31
|
+
"gle-Latn",
|
|
32
|
+
"gla-Latn",
|
|
33
|
+
"glg-Latn",
|
|
34
|
+
"guj-Latn",
|
|
35
|
+
"hau-Latn",
|
|
36
|
+
"heb-Latn",
|
|
37
|
+
"hin-Latn",
|
|
38
|
+
"hin-Deva",
|
|
39
|
+
"hrv-Latn",
|
|
40
|
+
"hun-Latn",
|
|
41
|
+
"hye-Latn",
|
|
42
|
+
"ind-Latn",
|
|
43
|
+
"isl-Latn",
|
|
44
|
+
"ita-Latn",
|
|
45
|
+
"jpn-Latn",
|
|
46
|
+
"jav-Latn",
|
|
47
|
+
"kat-Latn",
|
|
48
|
+
"kaz-Latn",
|
|
49
|
+
"khm-Latn",
|
|
50
|
+
"kan-Latn",
|
|
51
|
+
"kor-Latn",
|
|
52
|
+
"kur-Latn",
|
|
53
|
+
"kir-Latn",
|
|
54
|
+
"lat-Latn",
|
|
55
|
+
"lao-Latn",
|
|
56
|
+
"lit-Latn",
|
|
57
|
+
"lav-Latn",
|
|
58
|
+
"mlg-Latn",
|
|
59
|
+
"mkd-Latn",
|
|
60
|
+
"mal-Latn",
|
|
61
|
+
"mon-Latn",
|
|
62
|
+
"mar-Latn",
|
|
63
|
+
"msa-Latn",
|
|
64
|
+
"mya-Latn",
|
|
65
|
+
"nep-Latn",
|
|
66
|
+
"nld-Latn",
|
|
67
|
+
"nob-Latn",
|
|
68
|
+
"orm-Latn",
|
|
69
|
+
"ori-Latn",
|
|
70
|
+
"pan-Latn",
|
|
71
|
+
"pol-Latn",
|
|
72
|
+
"pus-Latn",
|
|
73
|
+
"por-Latn",
|
|
74
|
+
"ron-Latn",
|
|
75
|
+
"rus-Latn",
|
|
76
|
+
"san-Latn",
|
|
77
|
+
"snd-Latn",
|
|
78
|
+
"sin-Latn",
|
|
79
|
+
"slk-Latn",
|
|
80
|
+
"slv-Latn",
|
|
81
|
+
"som-Latn",
|
|
82
|
+
"sqi-Latn",
|
|
83
|
+
"srp-Latn",
|
|
84
|
+
"sun-Latn",
|
|
85
|
+
"swe-Latn",
|
|
86
|
+
"swa-Latn",
|
|
87
|
+
"tam-Latn",
|
|
88
|
+
"tam-Taml",
|
|
89
|
+
"tel-Latn",
|
|
90
|
+
"tel-Telu",
|
|
91
|
+
"tha-Latn",
|
|
92
|
+
"tgl-Latn",
|
|
93
|
+
"tur-Latn",
|
|
94
|
+
"uig-Latn",
|
|
95
|
+
"ukr-Latn",
|
|
96
|
+
"urd-Latn",
|
|
97
|
+
"urd-Arab",
|
|
98
|
+
"uzb-Latn",
|
|
99
|
+
"vie-Latn",
|
|
100
|
+
"xho-Latn",
|
|
101
|
+
"yid-Latn",
|
|
102
|
+
"zho-Hant",
|
|
103
|
+
"zho-Hans",
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
xlmr_base = ModelMeta(
|
|
108
|
+
loader=sentence_transformers_loader, # type: ignore[arg-type]
|
|
109
|
+
name="FacebookAI/xlm-roberta-base",
|
|
110
|
+
model_type=["dense"],
|
|
111
|
+
languages=XLMR_LANGUAGES,
|
|
112
|
+
open_weights=True,
|
|
113
|
+
revision="e73636d4f797dec63c3081bb6ed5c7b0bb3f2089",
|
|
114
|
+
release_date="2019-11-05", # arxiv paper release
|
|
115
|
+
n_parameters=278043648,
|
|
116
|
+
memory_usage_mb=1064,
|
|
117
|
+
embed_dim=768,
|
|
118
|
+
license="mit",
|
|
119
|
+
max_tokens=512,
|
|
120
|
+
reference="https://huggingface.co/FacebookAI/xlm-roberta-base",
|
|
121
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
122
|
+
framework=[
|
|
123
|
+
"Sentence Transformers",
|
|
124
|
+
"PyTorch",
|
|
125
|
+
"Transformers",
|
|
126
|
+
"ONNX",
|
|
127
|
+
"safetensors",
|
|
128
|
+
],
|
|
129
|
+
use_instructions=False,
|
|
130
|
+
public_training_code=None,
|
|
131
|
+
public_training_data=None,
|
|
132
|
+
training_datasets=set(),
|
|
133
|
+
citation="""@article{DBLP:journals/corr/abs-1911-02116,
|
|
134
|
+
author = {Alexis Conneau and
|
|
135
|
+
Kartikay Khandelwal and
|
|
136
|
+
Naman Goyal and
|
|
137
|
+
Vishrav Chaudhary and
|
|
138
|
+
Guillaume Wenzek and
|
|
139
|
+
Francisco Guzm{\'{a}}n and
|
|
140
|
+
Edouard Grave and
|
|
141
|
+
Myle Ott and
|
|
142
|
+
Luke Zettlemoyer and
|
|
143
|
+
Veselin Stoyanov},
|
|
144
|
+
title = {Unsupervised Cross-lingual Representation Learning at Scale},
|
|
145
|
+
journal = {CoRR},
|
|
146
|
+
volume = {abs/1911.02116},
|
|
147
|
+
year = {2019},
|
|
148
|
+
url = {http://arxiv.org/abs/1911.02116},
|
|
149
|
+
eprinttype = {arXiv},
|
|
150
|
+
eprint = {1911.02116},
|
|
151
|
+
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
|
|
152
|
+
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
|
|
153
|
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
|
154
|
+
}""",
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
xlmr_large = ModelMeta(
|
|
158
|
+
loader=sentence_transformers_loader, # type: ignore[arg-type]
|
|
159
|
+
name="FacebookAI/xlm-roberta-large",
|
|
160
|
+
model_type=["dense"],
|
|
161
|
+
languages=XLMR_LANGUAGES,
|
|
162
|
+
open_weights=True,
|
|
163
|
+
revision="c23d21b0620b635a76227c604d44e43a9f0ee389",
|
|
164
|
+
release_date="2019-11-05", # arxiv paper release
|
|
165
|
+
n_parameters=559890432,
|
|
166
|
+
memory_usage_mb=2141,
|
|
167
|
+
embed_dim=1024,
|
|
168
|
+
license="mit",
|
|
169
|
+
max_tokens=512,
|
|
170
|
+
reference="https://huggingface.co/FacebookAI/xlm-roberta-large",
|
|
171
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
172
|
+
framework=[
|
|
173
|
+
"Sentence Transformers",
|
|
174
|
+
"PyTorch",
|
|
175
|
+
"Transformers",
|
|
176
|
+
"ONNX",
|
|
177
|
+
"safetensors",
|
|
178
|
+
],
|
|
179
|
+
use_instructions=False,
|
|
180
|
+
public_training_code=None,
|
|
181
|
+
public_training_data=None,
|
|
182
|
+
training_datasets=set(),
|
|
183
|
+
citation="""@article{DBLP:journals/corr/abs-1911-02116,
|
|
184
|
+
author = {Alexis Conneau and
|
|
185
|
+
Kartikay Khandelwal and
|
|
186
|
+
Naman Goyal and
|
|
187
|
+
Vishrav Chaudhary and
|
|
188
|
+
Guillaume Wenzek and
|
|
189
|
+
Francisco Guzm{\'{a}}n and
|
|
190
|
+
Edouard Grave and
|
|
191
|
+
Myle Ott and
|
|
192
|
+
Luke Zettlemoyer and
|
|
193
|
+
Veselin Stoyanov},
|
|
194
|
+
title = {Unsupervised Cross-lingual Representation Learning at Scale},
|
|
195
|
+
journal = {CoRR},
|
|
196
|
+
volume = {abs/1911.02116},
|
|
197
|
+
year = {2019},
|
|
198
|
+
url = {http://arxiv.org/abs/1911.02116},
|
|
199
|
+
eprinttype = {arXiv},
|
|
200
|
+
eprint = {1911.02116},
|
|
201
|
+
timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
|
|
202
|
+
biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib},
|
|
203
|
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
|
204
|
+
}""",
|
|
205
|
+
)
|
|
@@ -7,6 +7,7 @@ from mteb.models.model_meta import ModelMeta
|
|
|
7
7
|
|
|
8
8
|
geoembedding = ModelMeta(
|
|
9
9
|
name="GeoGPT-Research-Project/GeoEmbedding",
|
|
10
|
+
model_type=["dense"],
|
|
10
11
|
languages=["eng-Latn"],
|
|
11
12
|
open_weights=True,
|
|
12
13
|
revision="29803c28ea7ef6871194a8ebc85ad7bfe174928e",
|
|
@@ -25,7 +26,7 @@ geoembedding = ModelMeta(
|
|
|
25
26
|
max_tokens=32768,
|
|
26
27
|
reference="https://huggingface.co/GeoGPT-Research-Project/GeoEmbedding",
|
|
27
28
|
similarity_fn_name="cosine",
|
|
28
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
29
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
29
30
|
use_instructions=True,
|
|
30
31
|
public_training_code=None,
|
|
31
32
|
public_training_data=None,
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import math
|
|
3
|
-
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
4
7
|
|
|
5
8
|
import torch
|
|
6
|
-
from PIL import Image
|
|
7
9
|
from torch.utils.data import DataLoader
|
|
8
10
|
from tqdm.autonotebook import tqdm
|
|
9
11
|
|
|
@@ -12,6 +14,9 @@ from mteb.models.abs_encoder import AbsEncoder
|
|
|
12
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
15
|
from mteb.types import Array, BatchedInput, PromptType
|
|
14
16
|
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
15
20
|
logger = logging.getLogger(__name__)
|
|
16
21
|
|
|
17
22
|
GME_CITATION = """@misc{zhang2024gme,
|
|
@@ -257,9 +262,9 @@ def smart_resize(
|
|
|
257
262
|
w_bar = ceil_by_factor(width * beta, factor)
|
|
258
263
|
|
|
259
264
|
if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
)
|
|
265
|
+
msg = f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}"
|
|
266
|
+
logger.warning(msg)
|
|
267
|
+
warnings.warn(msg)
|
|
263
268
|
if h_bar > w_bar:
|
|
264
269
|
h_bar = w_bar * MAX_RATIO
|
|
265
270
|
else:
|
|
@@ -267,9 +272,9 @@ def smart_resize(
|
|
|
267
272
|
return h_bar, w_bar
|
|
268
273
|
|
|
269
274
|
|
|
270
|
-
def fetch_image(
|
|
271
|
-
|
|
272
|
-
|
|
275
|
+
def fetch_image(image: Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
|
|
276
|
+
from PIL import Image
|
|
277
|
+
|
|
273
278
|
image_obj = None
|
|
274
279
|
if isinstance(image, Image.Image):
|
|
275
280
|
image_obj = image
|
|
@@ -342,6 +347,7 @@ training_data = {
|
|
|
342
347
|
gme_qwen2vl_2b = ModelMeta(
|
|
343
348
|
loader=GmeQwen2VL,
|
|
344
349
|
name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
|
|
350
|
+
model_type=["dense"],
|
|
345
351
|
languages=["eng-Latn", "cmn-Hans"],
|
|
346
352
|
open_weights=True,
|
|
347
353
|
revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a",
|
|
@@ -354,7 +360,7 @@ gme_qwen2vl_2b = ModelMeta(
|
|
|
354
360
|
max_tokens=32768,
|
|
355
361
|
reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
|
|
356
362
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
357
|
-
framework=["PyTorch"],
|
|
363
|
+
framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
|
|
358
364
|
use_instructions=True,
|
|
359
365
|
public_training_code=None,
|
|
360
366
|
public_training_data=None,
|
|
@@ -365,6 +371,7 @@ gme_qwen2vl_2b = ModelMeta(
|
|
|
365
371
|
gme_qwen2vl_7b = ModelMeta(
|
|
366
372
|
loader=GmeQwen2VL,
|
|
367
373
|
name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
|
|
374
|
+
model_type=["dense"],
|
|
368
375
|
languages=["eng-Latn", "cmn-Hans"],
|
|
369
376
|
open_weights=True,
|
|
370
377
|
revision="477027a6480f8630363be77751f169cc3434b673",
|
|
@@ -377,7 +384,7 @@ gme_qwen2vl_7b = ModelMeta(
|
|
|
377
384
|
max_tokens=32768,
|
|
378
385
|
reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
|
|
379
386
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
380
|
-
framework=["PyTorch"],
|
|
387
|
+
framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
|
|
381
388
|
use_instructions=True,
|
|
382
389
|
public_training_code=None,
|
|
383
390
|
public_training_data=None,
|
|
@@ -147,10 +147,10 @@ class GoogleTextEmbeddingModel(AbsEncoder):
|
|
|
147
147
|
google_text_emb_004 = ModelMeta(
|
|
148
148
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
149
149
|
loader_kwargs=dict(
|
|
150
|
-
model_name="text-embedding-004",
|
|
151
150
|
model_prompts=MODEL_PROMPTS,
|
|
152
151
|
),
|
|
153
152
|
name="google/text-embedding-004",
|
|
153
|
+
model_type=["dense"],
|
|
154
154
|
languages=["eng-Latn"],
|
|
155
155
|
open_weights=False,
|
|
156
156
|
revision="1", # revision is intended for implementation
|
|
@@ -172,10 +172,10 @@ google_text_emb_004 = ModelMeta(
|
|
|
172
172
|
google_text_emb_005 = ModelMeta(
|
|
173
173
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
174
174
|
loader_kwargs=dict(
|
|
175
|
-
model_name="text-embedding-005",
|
|
176
175
|
model_prompts=MODEL_PROMPTS,
|
|
177
176
|
),
|
|
178
177
|
name="google/text-embedding-005",
|
|
178
|
+
model_type=["dense"],
|
|
179
179
|
languages=["eng-Latn"],
|
|
180
180
|
open_weights=False,
|
|
181
181
|
revision="1", # revision is intended for implementation
|
|
@@ -197,10 +197,10 @@ google_text_emb_005 = ModelMeta(
|
|
|
197
197
|
google_text_multilingual_emb_002 = ModelMeta(
|
|
198
198
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
199
199
|
loader_kwargs=dict(
|
|
200
|
-
model_name="text-embedding-002",
|
|
201
200
|
model_prompts=MODEL_PROMPTS,
|
|
202
201
|
),
|
|
203
202
|
name="google/text-multilingual-embedding-002",
|
|
203
|
+
model_type=["dense"],
|
|
204
204
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES, # From the list of evaluated languages in https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#supported_text_languages
|
|
205
205
|
open_weights=False,
|
|
206
206
|
revision="1",
|
|
@@ -222,10 +222,10 @@ google_text_multilingual_emb_002 = ModelMeta(
|
|
|
222
222
|
google_gemini_embedding_001 = ModelMeta(
|
|
223
223
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
224
224
|
loader_kwargs=dict(
|
|
225
|
-
model_name="gemini-embedding-001",
|
|
226
225
|
model_prompts=MODEL_PROMPTS,
|
|
227
226
|
),
|
|
228
227
|
name="google/gemini-embedding-001",
|
|
228
|
+
model_type=["dense"],
|
|
229
229
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES,
|
|
230
230
|
open_weights=False,
|
|
231
231
|
revision="1",
|
|
@@ -260,6 +260,7 @@ def gemma_embedding_loader(model_name: str, revision: str, **kwargs):
|
|
|
260
260
|
embedding_gemma_300m = ModelMeta(
|
|
261
261
|
loader=gemma_embedding_loader,
|
|
262
262
|
name="google/embeddinggemma-300m",
|
|
263
|
+
model_type=["dense"],
|
|
263
264
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES,
|
|
264
265
|
open_weights=True,
|
|
265
266
|
revision="64614b0b8b64f0c6c1e52b07e4e9a4e8fe4d2da2",
|
|
@@ -269,11 +270,21 @@ embedding_gemma_300m = ModelMeta(
|
|
|
269
270
|
max_tokens=2048,
|
|
270
271
|
license="gemma",
|
|
271
272
|
reference="https://ai.google.dev/gemma/docs/embeddinggemma/model_card",
|
|
272
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
273
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors"],
|
|
273
274
|
use_instructions=True,
|
|
274
275
|
public_training_code=None,
|
|
275
276
|
public_training_data=None,
|
|
276
277
|
training_datasets=GECKO_TRAINING_DATA,
|
|
277
278
|
similarity_fn_name="cosine",
|
|
278
|
-
memory_usage_mb=
|
|
279
|
+
memory_usage_mb=1155,
|
|
280
|
+
citation="""
|
|
281
|
+
@misc{vera2025embeddinggemmapowerfullightweighttext,
|
|
282
|
+
title={EmbeddingGemma: Powerful and Lightweight Text Representations},
|
|
283
|
+
author={Henrique Schechter Vera and Sahil Dua and Biao Zhang and Daniel Salz and Ryan Mullins and Sindhu Raghuram Panyam and Sara Smoot and Iftekhar Naim and Joe Zou and Feiyang Chen and Daniel Cer and Alice Lisak and Min Choi and Lucas Gonzalez and Omar Sanseviero and Glenn Cameron and Ian Ballantyne and Kat Black and Kaifeng Chen and Weiyi Wang and Zhe Li and Gus Martins and Jinhyuk Lee and Mark Sherwood and Juyeong Ji and Renjie Wu and Jingxiao Zheng and Jyotinder Singh and Abheesht Sharma and Divyashree Sreepathihalli and Aashi Jain and Adham Elarabawy and AJ Co and Andreas Doumanoglou and Babak Samari and Ben Hora and Brian Potetz and Dahun Kim and Enrique Alfonseca and Fedor Moiseev and Feng Han and Frank Palma Gomez and Gustavo Hernández Ábrego and Hesen Zhang and Hui Hui and Jay Han and Karan Gill and Ke Chen and Koert Chen and Madhuri Shanbhogue and Michael Boratko and Paul Suganthan and Sai Meher Karthik Duddu and Sandeep Mariserla and Setareh Ariafar and Shanfeng Zhang and Shijie Zhang and Simon Baumgartner and Sonam Goenka and Steve Qiu and Tanmaya Dabral and Trevor Walker and Vikram Rao and Waleed Khawaja and Wenlei Zhou and Xiaoqi Ren and Ye Xia and Yichang Chen and Yi-Ting Chen and Zhe Dong and Zhongli Ding and Francesco Visin and Gaël Liu and Jiageng Zhang and Kathleen Kenealy and Michelle Casbon and Ravin Kumar and Thomas Mesnard and Zach Gleicher and Cormac Brick and Olivier Lacombe and Adam Roberts and Qin Yin and Yunhsuan Sung and Raphael Hoffmann and Tris Warkentin and Armand Joulin and Tom Duerig and Mojtaba Seyedhosseini},
|
|
284
|
+
year={2025},
|
|
285
|
+
eprint={2509.20354},
|
|
286
|
+
archivePrefix={arXiv},
|
|
287
|
+
primaryClass={cs.CL},
|
|
288
|
+
url={https://arxiv.org/abs/2509.20354},
|
|
289
|
+
}""",
|
|
279
290
|
)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from PIL import Image
|
|
6
7
|
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
@@ -15,6 +16,9 @@ from mteb.types import Array, BatchedInput, PromptType
|
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from PIL import Image
|
|
21
|
+
|
|
18
22
|
|
|
19
23
|
class GraniteVisionEmbeddingWrapper:
|
|
20
24
|
def __init__(
|
|
@@ -162,6 +166,7 @@ granite_vision_embedding = ModelMeta(
|
|
|
162
166
|
torch_dtype=torch.float16,
|
|
163
167
|
),
|
|
164
168
|
name="ibm-granite/granite-vision-3.3-2b-embedding",
|
|
169
|
+
model_type=["dense"],
|
|
165
170
|
languages=["eng-Latn"],
|
|
166
171
|
revision="cee615db64d89d1552a4ee39c50f25c0fc5c66ca",
|
|
167
172
|
release_date="2025-06-11",
|
|
@@ -174,7 +179,7 @@ granite_vision_embedding = ModelMeta(
|
|
|
174
179
|
open_weights=True,
|
|
175
180
|
public_training_code=None,
|
|
176
181
|
public_training_data=None,
|
|
177
|
-
framework=["PyTorch"],
|
|
182
|
+
framework=["PyTorch", "Transformers", "safetensors"],
|
|
178
183
|
reference="https://huggingface.co/ibm-granite/granite-vision-3.3-2b-embedding",
|
|
179
184
|
similarity_fn_name="MaxSim",
|
|
180
185
|
use_instructions=True,
|
|
@@ -38,6 +38,7 @@ gritlm7b = ModelMeta(
|
|
|
38
38
|
torch_dtype="auto",
|
|
39
39
|
),
|
|
40
40
|
name="GritLM/GritLM-7B",
|
|
41
|
+
model_type=["dense"],
|
|
41
42
|
languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
|
|
42
43
|
open_weights=True,
|
|
43
44
|
revision="13f00a0e36500c80ce12870ea513846a066004af",
|
|
@@ -49,7 +50,7 @@ gritlm7b = ModelMeta(
|
|
|
49
50
|
max_tokens=32768,
|
|
50
51
|
reference="https://huggingface.co/GritLM/GritLM-7B",
|
|
51
52
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
52
|
-
framework=["GritLM", "PyTorch"],
|
|
53
|
+
framework=["GritLM", "PyTorch", "Transformers", "safetensors"],
|
|
53
54
|
use_instructions=True,
|
|
54
55
|
training_datasets=GRIT_LM_TRAINING_DATA,
|
|
55
56
|
# section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data
|
|
@@ -66,6 +67,7 @@ gritlm8x7b = ModelMeta(
|
|
|
66
67
|
torch_dtype="auto",
|
|
67
68
|
),
|
|
68
69
|
name="GritLM/GritLM-8x7B",
|
|
70
|
+
model_type=["dense"],
|
|
69
71
|
languages=["eng-Latn", "fra-Latn", "deu-Latn", "ita-Latn", "spa-Latn"],
|
|
70
72
|
open_weights=True,
|
|
71
73
|
revision="7f089b13e3345510281733ca1e6ff871b5b4bc76",
|
|
@@ -77,7 +79,7 @@ gritlm8x7b = ModelMeta(
|
|
|
77
79
|
max_tokens=32768,
|
|
78
80
|
reference="https://huggingface.co/GritLM/GritLM-8x7B",
|
|
79
81
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
80
|
-
framework=["GritLM", "PyTorch"],
|
|
82
|
+
framework=["GritLM", "PyTorch", "Transformers", "safetensors"],
|
|
81
83
|
use_instructions=True,
|
|
82
84
|
training_datasets=GRIT_LM_TRAINING_DATA,
|
|
83
85
|
citation=GRITLM_CITATION,
|