mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
|
@@ -13,6 +13,7 @@ from .sentence_transformers_models import sent_trf_training_dataset
|
|
|
13
13
|
Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
|
|
14
14
|
loader=sentence_transformers_loader,
|
|
15
15
|
name="Haon-Chen/speed-embedding-7b-instruct",
|
|
16
|
+
model_type=["dense"],
|
|
16
17
|
revision="c167e9a8144b397622ce47b85d9edcdeecef3d3f",
|
|
17
18
|
release_date="2024-10-31",
|
|
18
19
|
languages=["eng-Latn"],
|
|
@@ -31,9 +32,16 @@ Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
|
|
|
31
32
|
training_datasets=None,
|
|
32
33
|
adapted_from="mistralai/Mistral-7B-v0.1",
|
|
33
34
|
superseded_by=None,
|
|
35
|
+
citation="""@article{chen2024little,
|
|
36
|
+
title={Little Giants: Synthesizing High-Quality Embedding Data at Scale},
|
|
37
|
+
author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
|
|
38
|
+
journal={arXiv preprint arXiv:2410.18634},
|
|
39
|
+
year={2024}
|
|
40
|
+
}""",
|
|
34
41
|
)
|
|
35
42
|
Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
|
|
36
43
|
name="Gameselo/STS-multilingual-mpnet-base-v2",
|
|
44
|
+
model_type=["dense"],
|
|
37
45
|
revision="449f917af30f590fc31f9ffb226c94f21a2f47b8",
|
|
38
46
|
release_date="2024-06-07",
|
|
39
47
|
languages=[],
|
|
@@ -121,10 +129,20 @@ Gameselo__STS_multilingual_mpnet_base_v2 = ModelMeta(
|
|
|
121
129
|
},
|
|
122
130
|
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
|
123
131
|
superseded_by=None,
|
|
132
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
133
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
134
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
135
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
136
|
+
month = "11",
|
|
137
|
+
year = "2019",
|
|
138
|
+
publisher = "Association for Computational Linguistics",
|
|
139
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
140
|
+
}""",
|
|
124
141
|
)
|
|
125
142
|
|
|
126
143
|
Hum_Works__lodestone_base_4096_v1 = ModelMeta(
|
|
127
144
|
name="Hum-Works/lodestone-base-4096-v1",
|
|
145
|
+
model_type=["dense"],
|
|
128
146
|
revision="9bbc2d0b57dd2198aea029404b0f976712a7d966",
|
|
129
147
|
release_date="2023-08-25",
|
|
130
148
|
languages=["eng-Latn"],
|
|
@@ -191,6 +209,7 @@ Hum_Works__lodestone_base_4096_v1 = ModelMeta(
|
|
|
191
209
|
)
|
|
192
210
|
Jaume__gemma_2b_embeddings = ModelMeta(
|
|
193
211
|
name="Jaume/gemma-2b-embeddings",
|
|
212
|
+
model_type=["dense"],
|
|
194
213
|
revision="86431f65d7c3f66b2af096c61e614a2958f191f1",
|
|
195
214
|
release_date="2024-06-29",
|
|
196
215
|
languages=[],
|
|
@@ -222,6 +241,7 @@ bilingual_embedding_training_data = {
|
|
|
222
241
|
|
|
223
242
|
Lajavaness__bilingual_embedding_base = ModelMeta(
|
|
224
243
|
name="Lajavaness/bilingual-embedding-base",
|
|
244
|
+
model_type=["dense"],
|
|
225
245
|
revision="0bfc54bb2aa2666dd84715289c7ef58a95eb4d8d",
|
|
226
246
|
release_date="2024-06-26",
|
|
227
247
|
languages=None,
|
|
@@ -244,9 +264,33 @@ Lajavaness__bilingual_embedding_base = ModelMeta(
|
|
|
244
264
|
training_datasets=bilingual_embedding_training_data,
|
|
245
265
|
adapted_from="dangvantuan/bilingual_impl",
|
|
246
266
|
superseded_by=None,
|
|
267
|
+
citation="""
|
|
268
|
+
@article{conneau2019unsupervised,
|
|
269
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
270
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
271
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
272
|
+
year={2019}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
@article{reimers2019sentence,
|
|
276
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
277
|
+
author={Reimers, Nils and Gurevych, Iryna},
|
|
278
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
279
|
+
year={2019}
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
@article{thakur2020augmented,
|
|
283
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
284
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
285
|
+
journal={arXiv e-prints},
|
|
286
|
+
pages={arXiv--2010},
|
|
287
|
+
year={2020}
|
|
288
|
+
}
|
|
289
|
+
""",
|
|
247
290
|
)
|
|
248
291
|
Lajavaness__bilingual_embedding_large = ModelMeta(
|
|
249
292
|
name="Lajavaness/bilingual-embedding-large",
|
|
293
|
+
model_type=["dense"],
|
|
250
294
|
revision="e83179d7a66e8aed1b3015e98bb5ae234ed89598",
|
|
251
295
|
release_date="2024-06-24",
|
|
252
296
|
languages=["fra-Latn", "eng-Latn"],
|
|
@@ -269,9 +313,33 @@ Lajavaness__bilingual_embedding_large = ModelMeta(
|
|
|
269
313
|
training_datasets=bilingual_embedding_training_data,
|
|
270
314
|
adapted_from="dangvantuan/bilingual_impl",
|
|
271
315
|
superseded_by=None,
|
|
316
|
+
citation="""
|
|
317
|
+
@article{conneau2019unsupervised,
|
|
318
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
319
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
320
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
321
|
+
year={2019}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
@article{reimers2019sentence,
|
|
325
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
326
|
+
author={Reimers, Nils and Gurevych, Iryna},
|
|
327
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
328
|
+
year={2019}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
@article{thakur2020augmented,
|
|
332
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
333
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
334
|
+
journal={arXiv e-prints},
|
|
335
|
+
pages={arXiv--2010},
|
|
336
|
+
year={2020}
|
|
337
|
+
}
|
|
338
|
+
""",
|
|
272
339
|
)
|
|
273
340
|
Lajavaness__bilingual_embedding_small = ModelMeta(
|
|
274
341
|
name="Lajavaness/bilingual-embedding-small",
|
|
342
|
+
model_type=["dense"],
|
|
275
343
|
revision="ed4a1dd814de0db81d4a4e287c296a03194463e3",
|
|
276
344
|
release_date="2024-07-17",
|
|
277
345
|
languages=["fra-Latn", "eng-Latn"],
|
|
@@ -294,9 +362,33 @@ Lajavaness__bilingual_embedding_small = ModelMeta(
|
|
|
294
362
|
training_datasets=bilingual_embedding_training_data,
|
|
295
363
|
adapted_from="dangvantuan/bilingual_impl",
|
|
296
364
|
superseded_by=None,
|
|
365
|
+
citation="""
|
|
366
|
+
@article{conneau2019unsupervised,
|
|
367
|
+
title={Unsupervised cross-lingual representation learning at scale},
|
|
368
|
+
author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
|
|
369
|
+
journal={arXiv preprint arXiv:1911.02116},
|
|
370
|
+
year={2019}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
@article{reimers2019sentence,
|
|
374
|
+
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
|
|
375
|
+
author={Reimers, Nils and Gurevych, Iryna},
|
|
376
|
+
journal={https://arxiv.org/abs/1908.10084},
|
|
377
|
+
year={2019}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
@article{thakur2020augmented,
|
|
381
|
+
title={Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
|
|
382
|
+
author={Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
|
|
383
|
+
journal={arXiv e-prints},
|
|
384
|
+
pages={arXiv--2010},
|
|
385
|
+
year={2020}
|
|
386
|
+
}
|
|
387
|
+
""",
|
|
297
388
|
)
|
|
298
389
|
Mihaiii__Bulbasaur = ModelMeta(
|
|
299
390
|
name="Mihaiii/Bulbasaur",
|
|
391
|
+
model_type=["dense"],
|
|
300
392
|
revision="6876f839e18ae36224049a41194a431953f08747",
|
|
301
393
|
release_date="2024-04-27",
|
|
302
394
|
languages=None,
|
|
@@ -320,6 +412,7 @@ Mihaiii__Bulbasaur = ModelMeta(
|
|
|
320
412
|
)
|
|
321
413
|
Mihaiii__Ivysaur = ModelMeta(
|
|
322
414
|
name="Mihaiii/Ivysaur",
|
|
415
|
+
model_type=["dense"],
|
|
323
416
|
revision="65914d976f45beb4bda7485c39d88865b4ce6554",
|
|
324
417
|
release_date="2024-04-27",
|
|
325
418
|
languages=None,
|
|
@@ -343,6 +436,7 @@ Mihaiii__Ivysaur = ModelMeta(
|
|
|
343
436
|
)
|
|
344
437
|
Mihaiii__Squirtle = ModelMeta(
|
|
345
438
|
name="Mihaiii/Squirtle",
|
|
439
|
+
model_type=["dense"],
|
|
346
440
|
revision="5b991da48a9286637a256d4a35aab87a1a57b78a",
|
|
347
441
|
release_date="2024-04-30",
|
|
348
442
|
languages=None,
|
|
@@ -366,6 +460,7 @@ Mihaiii__Squirtle = ModelMeta(
|
|
|
366
460
|
)
|
|
367
461
|
Mihaiii__Venusaur = ModelMeta(
|
|
368
462
|
name="Mihaiii/Venusaur",
|
|
463
|
+
model_type=["dense"],
|
|
369
464
|
revision="0dc817f0addbb7bab8feeeeaded538f9ffeb3419",
|
|
370
465
|
release_date="2024-04-29",
|
|
371
466
|
languages=None,
|
|
@@ -389,6 +484,7 @@ Mihaiii__Venusaur = ModelMeta(
|
|
|
389
484
|
)
|
|
390
485
|
Mihaiii__Wartortle = ModelMeta(
|
|
391
486
|
name="Mihaiii/Wartortle",
|
|
487
|
+
model_type=["dense"],
|
|
392
488
|
revision="14caca5253414d38a7d28b62d1b7c30ef3293a87",
|
|
393
489
|
release_date="2024-04-30",
|
|
394
490
|
languages=None,
|
|
@@ -412,6 +508,7 @@ Mihaiii__Wartortle = ModelMeta(
|
|
|
412
508
|
)
|
|
413
509
|
Mihaiii__gte_micro = ModelMeta(
|
|
414
510
|
name="Mihaiii/gte-micro",
|
|
511
|
+
model_type=["dense"],
|
|
415
512
|
revision="6fd2397cb9dfa7c901aedf9a2a44d3c888ccafdd",
|
|
416
513
|
release_date="2024-04-21",
|
|
417
514
|
languages=None,
|
|
@@ -434,6 +531,7 @@ Mihaiii__gte_micro = ModelMeta(
|
|
|
434
531
|
)
|
|
435
532
|
Mihaiii__gte_micro_v4 = ModelMeta(
|
|
436
533
|
name="Mihaiii/gte-micro-v4",
|
|
534
|
+
model_type=["dense"],
|
|
437
535
|
revision="78e1a4b348f8524c3ab2e3e3475788f5adb8c98f",
|
|
438
536
|
release_date="2024-04-22",
|
|
439
537
|
languages=None,
|
|
@@ -456,6 +554,7 @@ Mihaiii__gte_micro_v4 = ModelMeta(
|
|
|
456
554
|
)
|
|
457
555
|
OrdalieTech__Solon_embeddings_large_0_1 = ModelMeta(
|
|
458
556
|
name="OrdalieTech/Solon-embeddings-large-0.1",
|
|
557
|
+
model_type=["dense"],
|
|
459
558
|
revision="9f6465f6ea2f6d10c6294bc15d84edf87d47cdef",
|
|
460
559
|
release_date="2023-12-09",
|
|
461
560
|
languages=["fra-Latn"],
|
|
@@ -478,6 +577,7 @@ OrdalieTech__Solon_embeddings_large_0_1 = ModelMeta(
|
|
|
478
577
|
)
|
|
479
578
|
Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
480
579
|
name="Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
|
|
580
|
+
model_type=["dense"],
|
|
481
581
|
revision="d0361a36f6fe69febfc8550d0918abab174f6f30",
|
|
482
582
|
release_date="2024-06-16",
|
|
483
583
|
languages=["ara-Arab"],
|
|
@@ -497,9 +597,19 @@ Omartificial_Intelligence_Space__Arabert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
497
597
|
training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
498
598
|
adapted_from="aubmindlab/bert-base-arabertv02",
|
|
499
599
|
superseded_by=None,
|
|
600
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
601
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
602
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
603
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
604
|
+
month = "11",
|
|
605
|
+
year = "2019",
|
|
606
|
+
publisher = "Association for Computational Linguistics",
|
|
607
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
608
|
+
}""",
|
|
500
609
|
)
|
|
501
610
|
Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMeta(
|
|
502
611
|
name="Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
|
|
612
|
+
model_type=["dense"],
|
|
503
613
|
revision="6916465c43b984e955aa6dc72851474f0128f428",
|
|
504
614
|
release_date="2024-06-25",
|
|
505
615
|
languages=["ara-Arab"],
|
|
@@ -524,6 +634,7 @@ Omartificial_Intelligence_Space__Arabic_MiniLM_L12_v2_all_nli_triplet = ModelMet
|
|
|
524
634
|
)
|
|
525
635
|
Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
|
|
526
636
|
name="Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
|
|
637
|
+
model_type=["dense"],
|
|
527
638
|
revision="1ca467cc576bd76666a4d21b24ee43afa914dd10",
|
|
528
639
|
release_date="2024-06-14",
|
|
529
640
|
languages=["ara-Arab"],
|
|
@@ -545,9 +656,19 @@ Omartificial_Intelligence_Space__Arabic_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
545
656
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
546
657
|
adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
|
547
658
|
superseded_by=None,
|
|
659
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
660
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
661
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
662
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
663
|
+
month = "11",
|
|
664
|
+
year = "2019",
|
|
665
|
+
publisher = "Association for Computational Linguistics",
|
|
666
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
667
|
+
}""",
|
|
548
668
|
)
|
|
549
669
|
Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
|
|
550
670
|
name="Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
|
|
671
|
+
model_type=["dense"],
|
|
551
672
|
revision="ee6d5e33c78ed582ade47fd452a74ea52aa5bfe2",
|
|
552
673
|
release_date="2024-06-16",
|
|
553
674
|
languages=["ara-Arab"],
|
|
@@ -569,9 +690,19 @@ Omartificial_Intelligence_Space__Arabic_labse_Matryoshka = ModelMeta(
|
|
|
569
690
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
570
691
|
adapted_from="sentence-transformers/LaBSE",
|
|
571
692
|
superseded_by=None,
|
|
693
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
694
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
695
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
696
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
697
|
+
month = "11",
|
|
698
|
+
year = "2019",
|
|
699
|
+
publisher = "Association for Computational Linguistics",
|
|
700
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
701
|
+
}""",
|
|
572
702
|
)
|
|
573
703
|
Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
|
|
574
704
|
name="Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
|
|
705
|
+
model_type=["dense"],
|
|
575
706
|
revision="2628cb641e040f44328195fadcdfb58e6d5cffa7",
|
|
576
707
|
release_date="2024-06-15",
|
|
577
708
|
languages=["ara-Arab"],
|
|
@@ -593,9 +724,19 @@ Omartificial_Intelligence_Space__Arabic_mpnet_base_all_nli_triplet = ModelMeta(
|
|
|
593
724
|
# {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
594
725
|
adapted_from="tomaarsen/mpnet-base-all-nli-triplet",
|
|
595
726
|
superseded_by=None,
|
|
727
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
728
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
729
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
730
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
731
|
+
month = "11",
|
|
732
|
+
year = "2019",
|
|
733
|
+
publisher = "Association for Computational Linguistics",
|
|
734
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
735
|
+
}""",
|
|
596
736
|
)
|
|
597
737
|
Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
598
738
|
name="Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka",
|
|
739
|
+
model_type=["dense"],
|
|
599
740
|
revision="ecf3274e164f057c4a3dd70691cae0265d87a9d0",
|
|
600
741
|
release_date="2024-06-17",
|
|
601
742
|
languages=["ara-Arab"],
|
|
@@ -615,9 +756,19 @@ Omartificial_Intelligence_Space__Marbert_all_nli_triplet_Matryoshka = ModelMeta(
|
|
|
615
756
|
training_datasets=set(), # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet"},
|
|
616
757
|
adapted_from="UBC-NLP/MARBERTv2",
|
|
617
758
|
superseded_by=None,
|
|
759
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
760
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
761
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
762
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
763
|
+
month = "11",
|
|
764
|
+
year = "2019",
|
|
765
|
+
publisher = "Association for Computational Linguistics",
|
|
766
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
767
|
+
}""",
|
|
618
768
|
)
|
|
619
769
|
consciousai__cai_lunaris_text_embeddings = ModelMeta(
|
|
620
770
|
name="consciousAI/cai-lunaris-text-embeddings",
|
|
771
|
+
model_type=["dense"],
|
|
621
772
|
revision="8332c464d13505968ff7a6e2213f36fd8730b4c7",
|
|
622
773
|
release_date="2023-06-22",
|
|
623
774
|
languages=None,
|
|
@@ -640,6 +791,7 @@ consciousai__cai_lunaris_text_embeddings = ModelMeta(
|
|
|
640
791
|
)
|
|
641
792
|
consciousai__cai_stellaris_text_embeddings = ModelMeta(
|
|
642
793
|
name="consciousAI/cai-stellaris-text-embeddings",
|
|
794
|
+
model_type=["dense"],
|
|
643
795
|
revision="c000ec4b29588daf0f4a0b2ad4e72ee807d8efc0",
|
|
644
796
|
release_date="2023-06-23",
|
|
645
797
|
languages=None,
|
|
@@ -671,6 +823,7 @@ SENTENCE_CROISSANT_TRAINING_DATA = {
|
|
|
671
823
|
}
|
|
672
824
|
manu__sentence_croissant_alpha_v0_2 = ModelMeta(
|
|
673
825
|
name="manu/sentence_croissant_alpha_v0.2",
|
|
826
|
+
model_type=["dense"],
|
|
674
827
|
revision="4610b8cea65d7dd59e0b04af50753933fe5b29b2",
|
|
675
828
|
release_date="2024-03-15",
|
|
676
829
|
languages=None,
|
|
@@ -693,6 +846,7 @@ manu__sentence_croissant_alpha_v0_2 = ModelMeta(
|
|
|
693
846
|
)
|
|
694
847
|
manu__sentence_croissant_alpha_v0_3 = ModelMeta(
|
|
695
848
|
name="manu/sentence_croissant_alpha_v0.3",
|
|
849
|
+
model_type=["dense"],
|
|
696
850
|
revision="4ac16754f3d81aba76cc32955dc9ee4122df96eb",
|
|
697
851
|
release_date="2024-04-26",
|
|
698
852
|
languages=None,
|
|
@@ -715,6 +869,7 @@ manu__sentence_croissant_alpha_v0_3 = ModelMeta(
|
|
|
715
869
|
)
|
|
716
870
|
manu__sentence_croissant_alpha_v0_4 = ModelMeta(
|
|
717
871
|
name="manu/sentence_croissant_alpha_v0.4",
|
|
872
|
+
model_type=["dense"],
|
|
718
873
|
revision="0ce6372e6a3c21134dcf26dcde13cca869c767fc",
|
|
719
874
|
release_date="2024-04-27",
|
|
720
875
|
languages=["fra-Latn", "eng-Latn"],
|
|
@@ -738,6 +893,7 @@ manu__sentence_croissant_alpha_v0_4 = ModelMeta(
|
|
|
738
893
|
)
|
|
739
894
|
thenlper__gte_base = ModelMeta(
|
|
740
895
|
name="thenlper/gte-base",
|
|
896
|
+
model_type=["dense"],
|
|
741
897
|
revision="c078288308d8dee004ab72c6191778064285ec0c",
|
|
742
898
|
release_date="2023-07-27",
|
|
743
899
|
languages=["eng-Latn"],
|
|
@@ -757,9 +913,16 @@ thenlper__gte_base = ModelMeta(
|
|
|
757
913
|
training_datasets=None,
|
|
758
914
|
adapted_from=None,
|
|
759
915
|
superseded_by=None,
|
|
916
|
+
citation="""@article{li2023towards,
|
|
917
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
918
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
919
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
920
|
+
year={2023}
|
|
921
|
+
}""",
|
|
760
922
|
)
|
|
761
923
|
thenlper__gte_large = ModelMeta(
|
|
762
924
|
name="thenlper/gte-large",
|
|
925
|
+
model_type=["dense"],
|
|
763
926
|
revision="4bef63f39fcc5e2d6b0aae83089f307af4970164",
|
|
764
927
|
release_date="2023-07-27",
|
|
765
928
|
languages=["eng-Latn"],
|
|
@@ -779,9 +942,16 @@ thenlper__gte_large = ModelMeta(
|
|
|
779
942
|
training_datasets=None,
|
|
780
943
|
adapted_from=None,
|
|
781
944
|
superseded_by=None,
|
|
945
|
+
citation="""@article{li2023towards,
|
|
946
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
947
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
948
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
949
|
+
year={2023}
|
|
950
|
+
}""",
|
|
782
951
|
)
|
|
783
952
|
thenlper__gte_small = ModelMeta(
|
|
784
953
|
name="thenlper/gte-small",
|
|
954
|
+
model_type=["dense"],
|
|
785
955
|
revision="17e1f347d17fe144873b1201da91788898c639cd",
|
|
786
956
|
release_date="2023-07-27",
|
|
787
957
|
languages=["eng-Latn"],
|
|
@@ -801,9 +971,16 @@ thenlper__gte_small = ModelMeta(
|
|
|
801
971
|
training_datasets=None,
|
|
802
972
|
adapted_from=None,
|
|
803
973
|
superseded_by=None,
|
|
974
|
+
citation="""@article{li2023towards,
|
|
975
|
+
title={Towards general text embeddings with multi-stage contrastive learning},
|
|
976
|
+
author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},
|
|
977
|
+
journal={arXiv preprint arXiv:2308.03281},
|
|
978
|
+
year={2023}
|
|
979
|
+
}""",
|
|
804
980
|
)
|
|
805
981
|
OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
|
|
806
982
|
name="OrlikB/KartonBERT-USE-base-v1",
|
|
983
|
+
model_type=["dense"],
|
|
807
984
|
revision="1f59dd58fe57995c0e867d5e29f03763eae99645",
|
|
808
985
|
release_date="2024-09-30",
|
|
809
986
|
languages=["pol-Latn"],
|
|
@@ -826,6 +1003,7 @@ OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
|
|
|
826
1003
|
)
|
|
827
1004
|
OrlikB__st_polish_kartonberta_base_alpha_v1 = ModelMeta(
|
|
828
1005
|
name="OrlikB/st-polish-kartonberta-base-alpha-v1",
|
|
1006
|
+
model_type=["dense"],
|
|
829
1007
|
revision="5590a0e2d7bb43674e44d7076b3ff157f7d4a1cb",
|
|
830
1008
|
release_date="2023-11-12",
|
|
831
1009
|
languages=["pol-Latn"],
|
|
@@ -848,6 +1026,7 @@ OrlikB__st_polish_kartonberta_base_alpha_v1 = ModelMeta(
|
|
|
848
1026
|
)
|
|
849
1027
|
sdadas__mmlw_e5_base = ModelMeta(
|
|
850
1028
|
name="sdadas/mmlw-e5-base",
|
|
1029
|
+
model_type=["dense"],
|
|
851
1030
|
revision="f10628ed55b5ec400502aff439bd714a6da0af30",
|
|
852
1031
|
release_date="2023-11-17",
|
|
853
1032
|
languages=["pol-Latn"],
|
|
@@ -867,9 +1046,18 @@ sdadas__mmlw_e5_base = ModelMeta(
|
|
|
867
1046
|
training_datasets=E5_TRAINING_DATA,
|
|
868
1047
|
adapted_from="intfloat/multilingual-e5-base",
|
|
869
1048
|
superseded_by=None,
|
|
1049
|
+
citation="""@article{dadas2024pirb,
|
|
1050
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1051
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1052
|
+
year={2024},
|
|
1053
|
+
eprint={2402.13350},
|
|
1054
|
+
archivePrefix={arXiv},
|
|
1055
|
+
primaryClass={cs.CL}
|
|
1056
|
+
}""",
|
|
870
1057
|
)
|
|
871
1058
|
dwzhu__e5_base_4k = ModelMeta(
|
|
872
1059
|
name="dwzhu/e5-base-4k",
|
|
1060
|
+
model_type=["dense"],
|
|
873
1061
|
revision="1b5664b8cb2bccd8c309429b7bfe5864402e8fbc",
|
|
874
1062
|
release_date="2024-03-28",
|
|
875
1063
|
languages=["eng-Latn"],
|
|
@@ -889,9 +1077,16 @@ dwzhu__e5_base_4k = ModelMeta(
|
|
|
889
1077
|
training_datasets=E5_TRAINING_DATA,
|
|
890
1078
|
adapted_from="intfloat/e5-base-v2",
|
|
891
1079
|
superseded_by=None,
|
|
1080
|
+
citation="""@article{zhu2024longembed,
|
|
1081
|
+
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
|
|
1082
|
+
author={Zhu, Dawei and Wang, Liang and Yang, Nan and Song, Yifan and Wu, Wenhao and Wei, Furu and Li, Sujian},
|
|
1083
|
+
journal={arXiv preprint arXiv:2404.12096},
|
|
1084
|
+
year={2024}
|
|
1085
|
+
}""",
|
|
892
1086
|
)
|
|
893
1087
|
sdadas__mmlw_e5_large = ModelMeta(
|
|
894
1088
|
name="sdadas/mmlw-e5-large",
|
|
1089
|
+
model_type=["dense"],
|
|
895
1090
|
revision="5c143fb045ebed664fd85b43fc45155999eb110f",
|
|
896
1091
|
release_date="2023-11-17",
|
|
897
1092
|
languages=["pol-Latn"],
|
|
@@ -911,9 +1106,18 @@ sdadas__mmlw_e5_large = ModelMeta(
|
|
|
911
1106
|
training_datasets=E5_TRAINING_DATA,
|
|
912
1107
|
adapted_from="intfloat/multilingual-e5-large",
|
|
913
1108
|
superseded_by=None,
|
|
1109
|
+
citation="""@article{dadas2024pirb,
|
|
1110
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1111
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1112
|
+
year={2024},
|
|
1113
|
+
eprint={2402.13350},
|
|
1114
|
+
archivePrefix={arXiv},
|
|
1115
|
+
primaryClass={cs.CL}
|
|
1116
|
+
}""",
|
|
914
1117
|
)
|
|
915
1118
|
sdadas__mmlw_e5_small = ModelMeta(
|
|
916
1119
|
name="sdadas/mmlw-e5-small",
|
|
1120
|
+
model_type=["dense"],
|
|
917
1121
|
revision="ff1298cb6d997f18b794d2f3d73cad2ba2ad739a",
|
|
918
1122
|
release_date="2023-11-17",
|
|
919
1123
|
languages=["pol-Latn"],
|
|
@@ -933,9 +1137,18 @@ sdadas__mmlw_e5_small = ModelMeta(
|
|
|
933
1137
|
training_datasets=E5_TRAINING_DATA,
|
|
934
1138
|
adapted_from="intfloat/multilingual-e5-small",
|
|
935
1139
|
superseded_by=None,
|
|
1140
|
+
citation="""@article{dadas2024pirb,
|
|
1141
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1142
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1143
|
+
year={2024},
|
|
1144
|
+
eprint={2402.13350},
|
|
1145
|
+
archivePrefix={arXiv},
|
|
1146
|
+
primaryClass={cs.CL}
|
|
1147
|
+
}""",
|
|
936
1148
|
)
|
|
937
1149
|
sdadas__mmlw_roberta_base = ModelMeta(
|
|
938
1150
|
name="sdadas/mmlw-roberta-base",
|
|
1151
|
+
model_type=["dense"],
|
|
939
1152
|
revision="0ac7f23f6c96af601fa6a17852bd08d5136d6365",
|
|
940
1153
|
release_date="2023-11-17",
|
|
941
1154
|
languages=["pol-Latn"],
|
|
@@ -955,9 +1168,18 @@ sdadas__mmlw_roberta_base = ModelMeta(
|
|
|
955
1168
|
training_datasets={"MSMARCO"},
|
|
956
1169
|
adapted_from="sdadas/polish-roberta-base-v2",
|
|
957
1170
|
superseded_by=None,
|
|
1171
|
+
citation="""@article{dadas2024pirb,
|
|
1172
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1173
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1174
|
+
year={2024},
|
|
1175
|
+
eprint={2402.13350},
|
|
1176
|
+
archivePrefix={arXiv},
|
|
1177
|
+
primaryClass={cs.CL}
|
|
1178
|
+
}""",
|
|
958
1179
|
)
|
|
959
1180
|
sdadas__mmlw_roberta_large = ModelMeta(
|
|
960
1181
|
name="sdadas/mmlw-roberta-large",
|
|
1182
|
+
model_type=["dense"],
|
|
961
1183
|
revision="b8058066a8de32d0737b3cd82d8b4f4108745af9",
|
|
962
1184
|
release_date="2023-11-17",
|
|
963
1185
|
languages=["pol-Latn"],
|
|
@@ -977,6 +1199,14 @@ sdadas__mmlw_roberta_large = ModelMeta(
|
|
|
977
1199
|
training_datasets={"MSMARCO"},
|
|
978
1200
|
adapted_from="sdadas/polish-roberta-large-v2",
|
|
979
1201
|
superseded_by=None,
|
|
1202
|
+
citation="""@article{dadas2024pirb,
|
|
1203
|
+
title={{PIRB}: A Comprehensive Benchmark of Polish Dense and Hybrid Text Retrieval Methods},
|
|
1204
|
+
author={Sławomir Dadas and Michał Perełkiewicz and Rafał Poświata},
|
|
1205
|
+
year={2024},
|
|
1206
|
+
eprint={2402.13350},
|
|
1207
|
+
archivePrefix={arXiv},
|
|
1208
|
+
primaryClass={cs.CL}
|
|
1209
|
+
}""",
|
|
980
1210
|
)
|
|
981
1211
|
|
|
982
1212
|
udever_dataset = { # discussed here: https://github.com/embeddings-benchmark/mteb/issues/2193
|
|
@@ -1035,6 +1265,7 @@ udever_languages = [
|
|
|
1035
1265
|
|
|
1036
1266
|
izhx__udever_bloom_1b1 = ModelMeta(
|
|
1037
1267
|
name="izhx/udever-bloom-1b1",
|
|
1268
|
+
model_type=["dense"],
|
|
1038
1269
|
revision="7bf1ee29878cb040b2708a691aa4b61f27eaa252",
|
|
1039
1270
|
release_date="2023-10-24",
|
|
1040
1271
|
languages=udever_languages,
|
|
@@ -1054,9 +1285,16 @@ izhx__udever_bloom_1b1 = ModelMeta(
|
|
|
1054
1285
|
training_datasets=udever_dataset,
|
|
1055
1286
|
adapted_from="bigscience/bloom-1b1",
|
|
1056
1287
|
superseded_by=None,
|
|
1288
|
+
citation="""@article{zhang2023language,
|
|
1289
|
+
title={Language Models are Universal Embedders},
|
|
1290
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1291
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1292
|
+
year={2023}
|
|
1293
|
+
}""",
|
|
1057
1294
|
)
|
|
1058
1295
|
izhx__udever_bloom_3b = ModelMeta(
|
|
1059
1296
|
name="izhx/udever-bloom-3b",
|
|
1297
|
+
model_type=["dense"],
|
|
1060
1298
|
revision="4edd8affe80ca89ba0f6b6ba4103fc7f25fc57b2",
|
|
1061
1299
|
release_date="2023-10-24",
|
|
1062
1300
|
languages=udever_languages,
|
|
@@ -1076,9 +1314,16 @@ izhx__udever_bloom_3b = ModelMeta(
|
|
|
1076
1314
|
training_datasets=udever_dataset,
|
|
1077
1315
|
adapted_from="bigscience/bloom-3b",
|
|
1078
1316
|
superseded_by=None,
|
|
1317
|
+
citation="""@article{zhang2023language,
|
|
1318
|
+
title={Language Models are Universal Embedders},
|
|
1319
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1320
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1321
|
+
year={2023}
|
|
1322
|
+
}""",
|
|
1079
1323
|
)
|
|
1080
1324
|
izhx__udever_bloom_560m = ModelMeta(
|
|
1081
1325
|
name="izhx/udever-bloom-560m",
|
|
1326
|
+
model_type=["dense"],
|
|
1082
1327
|
revision="b2a723e355946ec5a5c5fbed3459766627ded2bb",
|
|
1083
1328
|
release_date="2023-10-24",
|
|
1084
1329
|
languages=udever_languages,
|
|
@@ -1098,9 +1343,16 @@ izhx__udever_bloom_560m = ModelMeta(
|
|
|
1098
1343
|
training_datasets=udever_dataset,
|
|
1099
1344
|
adapted_from="bigscience/bloom-560m",
|
|
1100
1345
|
superseded_by=None,
|
|
1346
|
+
citation="""@article{zhang2023language,
|
|
1347
|
+
title={Language Models are Universal Embedders},
|
|
1348
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1349
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1350
|
+
year={2023}
|
|
1351
|
+
}""",
|
|
1101
1352
|
)
|
|
1102
1353
|
izhx__udever_bloom_7b1 = ModelMeta(
|
|
1103
1354
|
name="izhx/udever-bloom-7b1",
|
|
1355
|
+
model_type=["dense"],
|
|
1104
1356
|
revision="18e8d3e6dbd94868584877f2e72a105a17df22ef",
|
|
1105
1357
|
release_date="2023-10-24",
|
|
1106
1358
|
languages=udever_languages,
|
|
@@ -1120,9 +1372,16 @@ izhx__udever_bloom_7b1 = ModelMeta(
|
|
|
1120
1372
|
training_datasets=udever_dataset,
|
|
1121
1373
|
adapted_from="bigscience/bloom-7b1",
|
|
1122
1374
|
superseded_by=None,
|
|
1375
|
+
citation="""@article{zhang2023language,
|
|
1376
|
+
title={Language Models are Universal Embedders},
|
|
1377
|
+
author={Zhang, Xin and Li, Zehan and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan and Zhang, Min},
|
|
1378
|
+
journal={arXiv preprint arXiv:2310.08232},
|
|
1379
|
+
year={2023}
|
|
1380
|
+
}""",
|
|
1123
1381
|
)
|
|
1124
1382
|
avsolatorio__gist_embedding_v0 = ModelMeta(
|
|
1125
1383
|
name="avsolatorio/GIST-Embedding-v0",
|
|
1384
|
+
model_type=["dense"],
|
|
1126
1385
|
revision="bf6b2e55e92f510a570ad4d7d2da2ec8cd22590c",
|
|
1127
1386
|
release_date="2024-01-31",
|
|
1128
1387
|
languages=["eng-Latn"],
|
|
@@ -1159,9 +1418,20 @@ avsolatorio__gist_embedding_v0 = ModelMeta(
|
|
|
1159
1418
|
| bge_training_data,
|
|
1160
1419
|
adapted_from="BAAI/bge-large-en-v1.5",
|
|
1161
1420
|
superseded_by=None,
|
|
1421
|
+
citation="""@article{solatorio2024gistembed,
|
|
1422
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1423
|
+
author={Aivin V. Solatorio},
|
|
1424
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1425
|
+
year={2024},
|
|
1426
|
+
URL={https://arxiv.org/abs/2402.16829},
|
|
1427
|
+
eprint={2402.16829},
|
|
1428
|
+
archivePrefix={arXiv},
|
|
1429
|
+
primaryClass={cs.LG}
|
|
1430
|
+
}""",
|
|
1162
1431
|
)
|
|
1163
1432
|
avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
|
|
1164
1433
|
name="avsolatorio/GIST-all-MiniLM-L6-v2",
|
|
1434
|
+
model_type=["dense"],
|
|
1165
1435
|
revision="ea89dfad053bba14677bb784a4269898abbdce44",
|
|
1166
1436
|
release_date="2024-02-03",
|
|
1167
1437
|
languages=["eng-Latn"],
|
|
@@ -1198,9 +1468,20 @@ avsolatorio__gist_all_minilm_l6_v2 = ModelMeta(
|
|
|
1198
1468
|
| bge_training_data,
|
|
1199
1469
|
adapted_from=None,
|
|
1200
1470
|
superseded_by=None,
|
|
1471
|
+
citation="""@article{solatorio2024gistembed,
|
|
1472
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1473
|
+
author={Aivin V. Solatorio},
|
|
1474
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1475
|
+
year={2024},
|
|
1476
|
+
URL={https://arxiv.org/abs/2402.16829},
|
|
1477
|
+
eprint={2402.16829},
|
|
1478
|
+
archivePrefix={arXiv},
|
|
1479
|
+
primaryClass={cs.LG}
|
|
1480
|
+
}""",
|
|
1201
1481
|
)
|
|
1202
1482
|
avsolatorio__gist_large_embedding_v0 = ModelMeta(
|
|
1203
1483
|
name="avsolatorio/GIST-large-Embedding-v0",
|
|
1484
|
+
model_type=["dense"],
|
|
1204
1485
|
revision="7831200e2f7819b994490c091cf3258a2b821f0c",
|
|
1205
1486
|
release_date="2024-02-14",
|
|
1206
1487
|
languages=["eng-Latn"],
|
|
@@ -1237,9 +1518,20 @@ avsolatorio__gist_large_embedding_v0 = ModelMeta(
|
|
|
1237
1518
|
| bge_training_data,
|
|
1238
1519
|
adapted_from=None,
|
|
1239
1520
|
superseded_by=None,
|
|
1521
|
+
citation="""@article{solatorio2024gistembed,
|
|
1522
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1523
|
+
author={Aivin V. Solatorio},
|
|
1524
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1525
|
+
year={2024},
|
|
1526
|
+
URL={https://arxiv.org/abs/2402.16829},
|
|
1527
|
+
eprint={2402.16829},
|
|
1528
|
+
archivePrefix={arXiv},
|
|
1529
|
+
primaryClass={cs.LG}
|
|
1530
|
+
}""",
|
|
1240
1531
|
)
|
|
1241
1532
|
avsolatorio__gist_small_embedding_v0 = ModelMeta(
|
|
1242
1533
|
name="avsolatorio/GIST-small-Embedding-v0",
|
|
1534
|
+
model_type=["dense"],
|
|
1243
1535
|
revision="d6c4190f9e01b9994dc7cac99cf2f2b85cfb57bc",
|
|
1244
1536
|
release_date="2024-02-03",
|
|
1245
1537
|
languages=["eng-Latn"],
|
|
@@ -1276,9 +1568,20 @@ avsolatorio__gist_small_embedding_v0 = ModelMeta(
|
|
|
1276
1568
|
| bge_training_data,
|
|
1277
1569
|
adapted_from=None,
|
|
1278
1570
|
superseded_by=None,
|
|
1571
|
+
citation="""@article{solatorio2024gistembed,
|
|
1572
|
+
title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
|
|
1573
|
+
author={Aivin V. Solatorio},
|
|
1574
|
+
journal={arXiv preprint arXiv:2402.16829},
|
|
1575
|
+
year={2024},
|
|
1576
|
+
URL={https://arxiv.org/abs/2402.16829},
|
|
1577
|
+
eprint={2402.16829},
|
|
1578
|
+
archivePrefix={arXiv},
|
|
1579
|
+
primaryClass={cs.LG}
|
|
1580
|
+
}""",
|
|
1279
1581
|
)
|
|
1280
1582
|
bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
|
|
1281
1583
|
name="bigscience/sgpt-bloom-7b1-msmarco",
|
|
1584
|
+
model_type=["dense"],
|
|
1282
1585
|
revision="dc579f3d2d5a0795eba2049e16c3e36c74007ad3",
|
|
1283
1586
|
release_date="2022-08-26",
|
|
1284
1587
|
languages=None,
|
|
@@ -1298,9 +1601,16 @@ bigscience__sgpt_bloom_7b1_msmarco = ModelMeta(
|
|
|
1298
1601
|
training_datasets=None,
|
|
1299
1602
|
adapted_from="/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
|
|
1300
1603
|
superseded_by=None,
|
|
1604
|
+
citation="""@article{muennighoff2022sgpt,
|
|
1605
|
+
title={SGPT: GPT Sentence Embeddings for Semantic Search},
|
|
1606
|
+
author={Muennighoff, Niklas},
|
|
1607
|
+
journal={arXiv preprint arXiv:2202.08904},
|
|
1608
|
+
year={2022}
|
|
1609
|
+
}""",
|
|
1301
1610
|
)
|
|
1302
1611
|
aari1995__german_semantic_sts_v2 = ModelMeta(
|
|
1303
1612
|
name="aari1995/German_Semantic_STS_V2",
|
|
1613
|
+
model_type=["dense"],
|
|
1304
1614
|
revision="22912542b0ec7a7ef369837e28ffe6352a27afc9",
|
|
1305
1615
|
release_date="2022-11-17",
|
|
1306
1616
|
languages=["deu-Latn"],
|
|
@@ -1324,6 +1634,7 @@ aari1995__german_semantic_sts_v2 = ModelMeta(
|
|
|
1324
1634
|
)
|
|
1325
1635
|
abhinand__medembed_small_v0_1 = ModelMeta(
|
|
1326
1636
|
name="abhinand/MedEmbed-small-v0.1",
|
|
1637
|
+
model_type=["dense"],
|
|
1327
1638
|
revision="40a5850d046cfdb56154e332b4d7099b63e8d50e",
|
|
1328
1639
|
release_date="2024-10-20",
|
|
1329
1640
|
languages=["eng-Latn"],
|
|
@@ -1352,9 +1663,16 @@ abhinand__medembed_small_v0_1 = ModelMeta(
|
|
|
1352
1663
|
},
|
|
1353
1664
|
adapted_from="BAAI/bge-base-en-v1.5",
|
|
1354
1665
|
superseded_by=None,
|
|
1666
|
+
citation="""@software{balachandran2024medembed,
|
|
1667
|
+
author = {Balachandran, Abhinand},
|
|
1668
|
+
title = {MedEmbed: Medical-Focused Embedding Models},
|
|
1669
|
+
year = {2024},
|
|
1670
|
+
url = {https://github.com/abhinand5/MedEmbed}
|
|
1671
|
+
}""",
|
|
1355
1672
|
)
|
|
1356
1673
|
avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
|
|
1357
1674
|
name="avsolatorio/NoInstruct-small-Embedding-v0",
|
|
1675
|
+
model_type=["dense"],
|
|
1358
1676
|
revision="b38747000553d8268915c95a55fc87e707c9aadd",
|
|
1359
1677
|
release_date="2024-05-01",
|
|
1360
1678
|
languages=["eng-Latn"],
|
|
@@ -1377,6 +1695,7 @@ avsolatorio__noinstruct_small_embedding_v0 = ModelMeta(
|
|
|
1377
1695
|
)
|
|
1378
1696
|
brahmairesearch__slx_v0_1 = ModelMeta(
|
|
1379
1697
|
name="brahmairesearch/slx-v0.1",
|
|
1698
|
+
model_type=["dense"],
|
|
1380
1699
|
revision="688c83fd1a7f34b25575a2bc26cfd87c11b4ce71",
|
|
1381
1700
|
release_date="2024-08-13",
|
|
1382
1701
|
languages=["eng-Latn"],
|
|
@@ -1399,6 +1718,7 @@ brahmairesearch__slx_v0_1 = ModelMeta(
|
|
|
1399
1718
|
)
|
|
1400
1719
|
deepfile__embedder_100p = ModelMeta(
|
|
1401
1720
|
name="deepfile/embedder-100p",
|
|
1721
|
+
model_type=["dense"],
|
|
1402
1722
|
revision="aa02f08f11517977fbcdc94dc9dbf9a1ca152d9b",
|
|
1403
1723
|
release_date="2023-07-24",
|
|
1404
1724
|
languages=None,
|
|
@@ -1421,6 +1741,7 @@ deepfile__embedder_100p = ModelMeta(
|
|
|
1421
1741
|
)
|
|
1422
1742
|
infgrad__stella_base_en_v2 = ModelMeta(
|
|
1423
1743
|
name="infgrad/stella-base-en-v2",
|
|
1744
|
+
model_type=["dense"],
|
|
1424
1745
|
revision="c9e80ff9892d80b39dc54e30a7873f91ea161034",
|
|
1425
1746
|
release_date="2023-10-19",
|
|
1426
1747
|
languages=["eng-Latn"],
|
|
@@ -1443,6 +1764,7 @@ infgrad__stella_base_en_v2 = ModelMeta(
|
|
|
1443
1764
|
)
|
|
1444
1765
|
malenia1__ternary_weight_embedding = ModelMeta(
|
|
1445
1766
|
name="malenia1/ternary-weight-embedding",
|
|
1767
|
+
model_type=["dense"],
|
|
1446
1768
|
revision="a1208fb7f646647bb62639fd2e1eb6cc2ef3738e",
|
|
1447
1769
|
release_date="2024-10-23",
|
|
1448
1770
|
languages=None,
|
|
@@ -1465,6 +1787,7 @@ malenia1__ternary_weight_embedding = ModelMeta(
|
|
|
1465
1787
|
)
|
|
1466
1788
|
omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
|
|
1467
1789
|
name="omarelshehy/arabic-english-sts-matryoshka",
|
|
1790
|
+
model_type=["dense"],
|
|
1468
1791
|
revision="763d116fbe8bf7883c64635c862feeaa3768bb64",
|
|
1469
1792
|
release_date="2024-10-13",
|
|
1470
1793
|
languages=["ara-Arab", "eng-Latn"],
|
|
@@ -1484,6 +1807,15 @@ omarelshehy__arabic_english_sts_matryoshka = ModelMeta(
|
|
|
1484
1807
|
training_datasets=None,
|
|
1485
1808
|
adapted_from="FacebookAI/xlm-roberta-large",
|
|
1486
1809
|
superseded_by=None,
|
|
1810
|
+
citation="""@inproceedings{reimers-2019-sentence-bert,
|
|
1811
|
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
|
1812
|
+
author = "Reimers, Nils and Gurevych, Iryna",
|
|
1813
|
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
|
1814
|
+
month = "11",
|
|
1815
|
+
year = "2019",
|
|
1816
|
+
publisher = "Association for Computational Linguistics",
|
|
1817
|
+
url = "https://arxiv.org/abs/1908.10084",
|
|
1818
|
+
}""",
|
|
1487
1819
|
)
|
|
1488
1820
|
openbmb__minicpm_embedding = ModelMeta(
|
|
1489
1821
|
loader=sentence_transformers_loader,
|
|
@@ -1496,6 +1828,7 @@ openbmb__minicpm_embedding = ModelMeta(
|
|
|
1496
1828
|
# https://huggingface.co/openbmb/MiniCPM-Embedding/blob/c0cb2de33fb366e17c30f9d53142ff11bc18e049/README.md?code=true#L405
|
|
1497
1829
|
),
|
|
1498
1830
|
name="openbmb/MiniCPM-Embedding",
|
|
1831
|
+
model_type=["dense"],
|
|
1499
1832
|
revision="c0cb2de33fb366e17c30f9d53142ff11bc18e049",
|
|
1500
1833
|
release_date="2024-09-04",
|
|
1501
1834
|
languages=["zho-Hans", "eng-Latn"],
|
|
@@ -1518,6 +1851,7 @@ openbmb__minicpm_embedding = ModelMeta(
|
|
|
1518
1851
|
|
|
1519
1852
|
silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
|
|
1520
1853
|
name="silma-ai/silma-embeddding-matryoshka-v0.1",
|
|
1854
|
+
model_type=["dense"],
|
|
1521
1855
|
revision="a520977a9542ebdb8a7206df6b7ff6977f1886ea",
|
|
1522
1856
|
release_date="2024-10-12",
|
|
1523
1857
|
languages=["ara-Arab", "eng-Latn"],
|
|
@@ -1537,10 +1871,18 @@ silma_ai__silma_embedding_matryoshka_v0_1 = ModelMeta(
|
|
|
1537
1871
|
training_datasets=None,
|
|
1538
1872
|
adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250",
|
|
1539
1873
|
superseded_by=None,
|
|
1874
|
+
citation="""@misc{silma2024embedding,
|
|
1875
|
+
author = {Abu Bakr Soliman and Karim Ouda and {SILMA AI}},
|
|
1876
|
+
title = {SILMA Embedding Matryoshka 0.1},
|
|
1877
|
+
year = {2024},
|
|
1878
|
+
publisher = {Hugging Face},
|
|
1879
|
+
howpublished = {https://huggingface.co/silma-ai/silma-embeddding-matryoshka-0.1},
|
|
1880
|
+
}""",
|
|
1540
1881
|
)
|
|
1541
1882
|
|
|
1542
1883
|
sbert_chinese_general_v1 = ModelMeta(
|
|
1543
1884
|
name="DMetaSoul/sbert-chinese-general-v1",
|
|
1885
|
+
model_type=["dense"],
|
|
1544
1886
|
revision="bd27765956bcc2fcf682de0097819947ac10037e",
|
|
1545
1887
|
release_date="2022-03-25",
|
|
1546
1888
|
languages=["zho-Hans"],
|
|
@@ -1568,6 +1910,7 @@ sbert_chinese_general_v1 = ModelMeta(
|
|
|
1568
1910
|
|
|
1569
1911
|
dmeta_embedding_zh_small = ModelMeta(
|
|
1570
1912
|
name="DMetaSoul/Dmeta-embedding-zh-small",
|
|
1913
|
+
model_type=["dense"],
|
|
1571
1914
|
revision="2050d3439a2f68999dd648c1697471acaac37a29",
|
|
1572
1915
|
release_date="2024-03-25",
|
|
1573
1916
|
languages=["zho-Hans"],
|
|
@@ -1590,6 +1933,7 @@ dmeta_embedding_zh_small = ModelMeta(
|
|
|
1590
1933
|
|
|
1591
1934
|
xiaobu_embedding = ModelMeta(
|
|
1592
1935
|
name="lier007/xiaobu-embedding",
|
|
1936
|
+
model_type=["dense"],
|
|
1593
1937
|
revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
|
|
1594
1938
|
release_date="2024-01-09",
|
|
1595
1939
|
languages=["zho-Hans"],
|
|
@@ -1613,6 +1957,7 @@ xiaobu_embedding = ModelMeta(
|
|
|
1613
1957
|
|
|
1614
1958
|
xiaobu_embedding_v2 = ModelMeta(
|
|
1615
1959
|
name="lier007/xiaobu-embedding-v2",
|
|
1960
|
+
model_type=["dense"],
|
|
1616
1961
|
revision="1912f2e59a5c2ef802a471d735a38702a5c9485e",
|
|
1617
1962
|
release_date="2024-06-30",
|
|
1618
1963
|
languages=["zho-Hans"],
|
|
@@ -1636,6 +1981,7 @@ xiaobu_embedding_v2 = ModelMeta(
|
|
|
1636
1981
|
|
|
1637
1982
|
yinka_embedding = ModelMeta(
|
|
1638
1983
|
name="Classical/Yinka",
|
|
1984
|
+
model_type=["dense"],
|
|
1639
1985
|
revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
|
|
1640
1986
|
release_date="2024-01-09",
|
|
1641
1987
|
languages=["zho-Hans"],
|
|
@@ -1658,6 +2004,7 @@ yinka_embedding = ModelMeta(
|
|
|
1658
2004
|
)
|
|
1659
2005
|
conan_embedding = ModelMeta(
|
|
1660
2006
|
name="TencentBAC/Conan-embedding-v1",
|
|
2007
|
+
model_type=["dense"],
|
|
1661
2008
|
revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb",
|
|
1662
2009
|
release_date="2024-08-22",
|
|
1663
2010
|
languages=["zho-Hans"],
|
|
@@ -1677,11 +2024,21 @@ conan_embedding = ModelMeta(
|
|
|
1677
2024
|
# source: https://arxiv.org/pdf/2408.15710
|
|
1678
2025
|
training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
|
|
1679
2026
|
superseded_by=None,
|
|
2027
|
+
citation="""@misc{li2024conanembeddinggeneraltextembedding,
|
|
2028
|
+
title={Conan-embedding: General Text Embedding with More and Better Negative Samples},
|
|
2029
|
+
author={Shiyu Li and Yang Tang and Shizhe Chen and Xi Chen},
|
|
2030
|
+
year={2024},
|
|
2031
|
+
eprint={2408.15710},
|
|
2032
|
+
archivePrefix={arXiv},
|
|
2033
|
+
primaryClass={cs.CL},
|
|
2034
|
+
url={https://arxiv.org/abs/2408.15710},
|
|
2035
|
+
}""",
|
|
1680
2036
|
)
|
|
1681
2037
|
|
|
1682
2038
|
ember_v1 = ModelMeta(
|
|
1683
2039
|
loader=sentence_transformers_loader,
|
|
1684
2040
|
name="llmrails/ember-v1",
|
|
2041
|
+
model_type=["dense"],
|
|
1685
2042
|
revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d",
|
|
1686
2043
|
release_date="2023-10-10",
|
|
1687
2044
|
languages=["eng-Latn"],
|
|
@@ -1699,4 +2056,9 @@ ember_v1 = ModelMeta(
|
|
|
1699
2056
|
use_instructions=None,
|
|
1700
2057
|
training_datasets=None,
|
|
1701
2058
|
superseded_by=None,
|
|
2059
|
+
citation="""@misc{nur2024emberv1,
|
|
2060
|
+
title={ember-v1: SOTA embedding model},
|
|
2061
|
+
author={Enrike Nur and Anar Aliyev},
|
|
2062
|
+
year={2023},
|
|
2063
|
+
}""",
|
|
1702
2064
|
)
|