mteb-2.1.4-py3-none-any.whl → mteb-2.7.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
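Among the changes in this range are a number of newly added evaluation tasks (for example TVPLRetrieval and the Nano*-VN retrieval tasks shown in the diffs below). As a quick orientation, here is a minimal sketch of running one of the new tasks against this version. It assumes mteb's 2.x top-level entry points (get_tasks, get_model, evaluate; the modules mteb/get_tasks.py and mteb/evaluate.py appear in the file list below), and the model name is purely illustrative, so verify the exact signatures against the installed 2.7.2 release.

    # Minimal sketch, assuming the mteb 2.x top-level API; the model name is
    # illustrative and not prescribed by this diff.
    import mteb

    # "TVPLRetrieval" is one of the tasks added in this version range
    # (see mteb/tasks/retrieval/vie/tvpl_retrieval.py below).
    tasks = mteb.get_tasks(tasks=["TVPLRetrieval"])

    # Any model registered with mteb should work here.
    model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

    # Run the evaluation and print each task's result.
    results = mteb.evaluate(model, tasks=tasks)
    for task_result in results:
        print(task_result)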
- mteb/__init__.py +6 -0
- mteb/_create_dataloaders.py +22 -20
- mteb/_evaluators/any_sts_evaluator.py +23 -14
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +3 -3
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
- mteb/_evaluators/pair_classification_evaluator.py +34 -40
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +25 -37
- mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
- mteb/_evaluators/text/summarization_evaluator.py +27 -20
- mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +105 -0
- mteb/abstasks/_statistics_calculation.py +23 -11
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -12
- mteb/abstasks/clustering.py +20 -16
- mteb/abstasks/clustering_legacy.py +13 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +33 -22
- mteb/abstasks/pair_classification.py +27 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +14 -4
- mteb/abstasks/task_metadata.py +32 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +77 -16
- mteb/benchmarks/benchmarks/__init__.py +12 -0
- mteb/benchmarks/benchmarks/benchmarks.py +361 -16
- mteb/benchmarks/get_benchmark.py +14 -53
- mteb/cache.py +227 -37
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +71 -62
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +106 -75
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +414 -151
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/load_results.py +12 -12
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +31 -23
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +3 -3
- mteb/models/get_model_meta.py +25 -118
- mteb/models/instruct_wrapper.py +33 -9
- mteb/models/model_implementations/align_models.py +8 -1
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +9 -1
- mteb/models/model_implementations/arctic_models.py +16 -8
- mteb/models/model_implementations/b1ade_models.py +2 -1
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +101 -17
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +13 -2
- mteb/models/model_implementations/blip_models.py +43 -16
- mteb/models/model_implementations/bm25.py +5 -4
- mteb/models/model_implementations/bmretriever_models.py +10 -4
- mteb/models/model_implementations/cadet_models.py +10 -1
- mteb/models/model_implementations/cde_models.py +25 -4
- mteb/models/model_implementations/clip_models.py +9 -6
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +165 -3
- mteb/models/model_implementations/codesage_models.py +18 -3
- mteb/models/model_implementations/cohere_models.py +13 -6
- mteb/models/model_implementations/cohere_v.py +7 -2
- mteb/models/model_implementations/colpali_models.py +17 -9
- mteb/models/model_implementations/colqwen_models.py +275 -5
- mteb/models/model_implementations/colsmol_models.py +4 -2
- mteb/models/model_implementations/conan_models.py +2 -1
- mteb/models/model_implementations/dino_models.py +194 -23
- mteb/models/model_implementations/e5_instruct.py +27 -4
- mteb/models/model_implementations/e5_models.py +21 -110
- mteb/models/model_implementations/e5_v.py +7 -6
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +2 -1
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +67 -9
- mteb/models/model_implementations/facebookai.py +205 -0
- mteb/models/model_implementations/geogpt_models.py +2 -1
- mteb/models/model_implementations/gme_v_models.py +17 -10
- mteb/models/model_implementations/google_models.py +17 -6
- mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
- mteb/models/model_implementations/gritlm_models.py +4 -2
- mteb/models/model_implementations/gte_models.py +99 -9
- mteb/models/model_implementations/hinvec_models.py +2 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +36 -6
- mteb/models/model_implementations/inf_models.py +4 -2
- mteb/models/model_implementations/jasper_models.py +256 -3
- mteb/models/model_implementations/jina_clip.py +49 -10
- mteb/models/model_implementations/jina_models.py +222 -11
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +37 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +2 -1
- mteb/models/model_implementations/linq_models.py +4 -3
- mteb/models/model_implementations/listconranker.py +2 -2
- mteb/models/model_implementations/llm2clip_models.py +9 -6
- mteb/models/model_implementations/llm2vec_models.py +16 -8
- mteb/models/model_implementations/mcinext_models.py +7 -1
- mteb/models/model_implementations/mdbr_models.py +19 -3
- mteb/models/model_implementations/misc_models.py +422 -60
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +15 -4
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +27 -14
- mteb/models/model_implementations/moka_models.py +4 -1
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
- mteb/models/model_implementations/nomic_models.py +173 -6
- mteb/models/model_implementations/nomic_models_vision.py +8 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
- mteb/models/model_implementations/nvidia_models.py +155 -20
- mteb/models/model_implementations/octen_models.py +254 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +37 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
- mteb/models/model_implementations/ops_moa_models.py +5 -3
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +9 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -8
- mteb/models/model_implementations/pylate_models.py +46 -12
- mteb/models/model_implementations/qodo_models.py +4 -2
- mteb/models/model_implementations/qtack_models.py +2 -1
- mteb/models/model_implementations/qwen3_models.py +9 -6
- mteb/models/model_implementations/qzhou_models.py +5 -3
- mteb/models/model_implementations/random_baseline.py +19 -24
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +2 -1
- mteb/models/model_implementations/repllama_models.py +5 -3
- mteb/models/model_implementations/rerankers_custom.py +15 -9
- mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
- mteb/models/model_implementations/richinfoai_models.py +2 -1
- mteb/models/model_implementations/ru_sentence_models.py +71 -20
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +6 -3
- mteb/models/model_implementations/samilpwc_models.py +2 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +2 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +177 -18
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +30 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +23 -4
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +11 -1
- mteb/models/model_implementations/uae_models.py +8 -1
- mteb/models/model_implementations/vdr_models.py +3 -1
- mteb/models/model_implementations/vi_vn_models.py +45 -6
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +5 -3
- mteb/models/model_implementations/voyage_models.py +99 -0
- mteb/models/model_implementations/voyage_v.py +17 -9
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +2 -1
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +498 -29
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
- mteb/models/search_wrappers.py +197 -65
- mteb/models/sentence_transformer_wrapper.py +52 -32
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +114 -65
- mteb/results/model_result.py +63 -26
- mteb/results/task_result.py +117 -77
- mteb/similarity_functions.py +60 -7
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +2 -3
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +2 -3
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +16 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +24 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +19 -2
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
- mteb/models/model_implementations/mxbai_models.py +0 -102
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
|
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class FiQA2018VN(AbsTaskRetrieval):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="FiQA2018-VN",
|
|
8
|
-
description="
|
|
9
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
10
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
11
|
-
- Applies advanced embedding models to filter the translations.
|
|
12
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
8
|
+
description="A translated dataset from Financial Opinion Mining and Question Answering The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
13
9
|
reference="https://sites.google.com/view/fiqa/",
|
|
14
10
|
dataset={
|
|
15
11
|
"path": "GreenNode/fiqa-vn",
|
|
@@ -26,5 +26,20 @@ class GreenNodeTableMarkdownRetrieval(AbsTaskRetrieval):
|
|
|
26
26
|
annotations_creators="human-annotated",
|
|
27
27
|
dialect=[],
|
|
28
28
|
sample_creation="found",
|
|
29
|
-
bibtex_citation=""
|
|
29
|
+
bibtex_citation=r"""
|
|
30
|
+
@inproceedings{10.1007/978-981-95-1746-6_17,
|
|
31
|
+
abstract = {Information retrieval often comes in plain text, lacking semi-structured text such as HTML and markdown, retrieving data that contains rich format such as table became non-trivial. In this paper, we tackle this challenge by introducing a new dataset, GreenNode Table Retrieval VN (GN-TRVN), which is collected from a massive corpus, a wide range of topics, and a longer context compared to ViQuAD2.0. To evaluate the effectiveness of our proposed dataset, we introduce two versions, M3-GN-VN and M3-GN-VN-Mixed, by fine-tuning the M3-Embedding model on this dataset. Experimental results show that our models consistently outperform the baselines, including the base model, across most evaluation criteria on various datasets such as VieQuADRetrieval, ZacLegalTextRetrieval, and GN-TRVN. In general, we release a more comprehensive dataset and two model versions that improve response performance for Vietnamese Markdown Table Retrieval.},
|
|
32
|
+
address = {Singapore},
|
|
33
|
+
author = {Pham, Bao Loc
|
|
34
|
+
and Hoang, Quoc Viet
|
|
35
|
+
and Luu, Quy Tung
|
|
36
|
+
and Vo, Trong Thu},
|
|
37
|
+
booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
|
|
38
|
+
isbn = {978-981-95-1746-6},
|
|
39
|
+
pages = {153--163},
|
|
40
|
+
publisher = {Springer Nature Singapore},
|
|
41
|
+
title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
|
|
42
|
+
year = {2026},
|
|
43
|
+
}
|
|
44
|
+
""",
|
|
30
45
|
)
|
|
@@ -9,12 +9,7 @@ class HotpotQAVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/hotpotqa-vn",
|
|
10
10
|
"revision": "8a5220c7af5084f0d5d2afeb74f9c2b41b759ff0",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
supervision for supporting facts to enable more explainable question answering systems.
|
|
14
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
15
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
16
|
-
- Applies advanced embedding models to filter the translations.
|
|
17
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
18
13
|
reference="https://hotpotqa.github.io/",
|
|
19
14
|
type="Retrieval",
|
|
20
15
|
category="t2t",
|
|
@@ -41,3 +36,42 @@ class HotpotQAVN(AbsTaskRetrieval):
|
|
|
41
36
|
""",
|
|
42
37
|
adapted_from=["HotpotQA"],
|
|
43
38
|
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class NanoHotpotQAVN(AbsTaskRetrieval):
|
|
42
|
+
metadata = TaskMetadata(
|
|
43
|
+
name="NanoHotpotQA-VN",
|
|
44
|
+
dataset={
|
|
45
|
+
"path": "GreenNode/nano-hotpotqa-vn",
|
|
46
|
+
"revision": "f4de19a2fae1a582de114e5bcd178bb262183113",
|
|
47
|
+
},
|
|
48
|
+
description="NanoHotpotQAVN is a small version of A translated dataset from HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
49
|
+
reference="https://hotpotqa.github.io/",
|
|
50
|
+
type="Retrieval",
|
|
51
|
+
category="t2t",
|
|
52
|
+
eval_splits=["test"],
|
|
53
|
+
eval_langs=["vie-Latn"],
|
|
54
|
+
main_score="ndcg_at_10",
|
|
55
|
+
date=("2025-07-29", "2025-07-30"),
|
|
56
|
+
license="cc-by-sa-4.0",
|
|
57
|
+
annotations_creators="derived",
|
|
58
|
+
dialect=[],
|
|
59
|
+
sample_creation="machine-translated and LM verified",
|
|
60
|
+
domains=["Web", "Written"],
|
|
61
|
+
task_subtypes=["Question answering"],
|
|
62
|
+
bibtex_citation=r"""
|
|
63
|
+
@misc{pham2025vnmtebvietnamesemassivetext,
|
|
64
|
+
archiveprefix = {arXiv},
|
|
65
|
+
author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
|
|
66
|
+
eprint = {2507.21500},
|
|
67
|
+
primaryclass = {cs.CL},
|
|
68
|
+
title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
|
|
69
|
+
url = {https://arxiv.org/abs/2507.21500},
|
|
70
|
+
year = {2025},
|
|
71
|
+
}
|
|
72
|
+
""",
|
|
73
|
+
prompt={
|
|
74
|
+
"query": "Given a multi-hop question, retrieve documents that can help answer the question"
|
|
75
|
+
},
|
|
76
|
+
adapted_from=["HotpotQA-VN"],
|
|
77
|
+
)
|
|
@@ -9,11 +9,7 @@ class MSMARCOVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/msmarco-vn",
|
|
10
10
|
"revision": "85d1ad4cc9070b8d019d65f5af1631a2ab91e294",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
14
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
15
|
-
- Applies advanced embedding models to filter the translations.
|
|
16
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from MS MARCO is a collection of datasets focused on deep learning in search The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
17
13
|
reference="https://microsoft.github.io/msmarco/",
|
|
18
14
|
type="Retrieval",
|
|
19
15
|
category="t2t",
|
|
@@ -51,3 +47,51 @@ class MSMARCOVN(AbsTaskRetrieval):
|
|
|
51
47
|
""",
|
|
52
48
|
adapted_from=["MSMARCO"],
|
|
53
49
|
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class NanoMSMARCOVN(AbsTaskRetrieval):
|
|
53
|
+
metadata = TaskMetadata(
|
|
54
|
+
name="NanoMSMARCO-VN",
|
|
55
|
+
dataset={
|
|
56
|
+
"path": "GreenNode/nano-msmarco-vn",
|
|
57
|
+
"revision": "f149369c82ec228b05b0f6677699ab4bfbab73f6",
|
|
58
|
+
},
|
|
59
|
+
description="NanoMSMARCOVN is a small version of A translated dataset from MS MARCO is a collection of datasets focused on deep learning in search The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
60
|
+
reference="https://microsoft.github.io/msmarco/",
|
|
61
|
+
type="Retrieval",
|
|
62
|
+
category="t2t",
|
|
63
|
+
eval_splits=["dev"],
|
|
64
|
+
eval_langs=["vie-Latn"],
|
|
65
|
+
main_score="ndcg_at_10",
|
|
66
|
+
date=("2025-07-29", "2025-07-30"),
|
|
67
|
+
license="cc-by-sa-4.0",
|
|
68
|
+
annotations_creators="derived",
|
|
69
|
+
dialect=[],
|
|
70
|
+
sample_creation="machine-translated and LM verified",
|
|
71
|
+
domains=[
|
|
72
|
+
"Encyclopaedic",
|
|
73
|
+
"Academic",
|
|
74
|
+
"Blog",
|
|
75
|
+
"News",
|
|
76
|
+
"Medical",
|
|
77
|
+
"Government",
|
|
78
|
+
"Reviews",
|
|
79
|
+
"Non-fiction",
|
|
80
|
+
"Social",
|
|
81
|
+
"Web",
|
|
82
|
+
],
|
|
83
|
+
task_subtypes=["Question answering"],
|
|
84
|
+
bibtex_citation=r"""
|
|
85
|
+
@misc{pham2025vnmtebvietnamesemassivetext,
|
|
86
|
+
archiveprefix = {arXiv},
|
|
87
|
+
author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
|
|
88
|
+
eprint = {2507.21500},
|
|
89
|
+
primaryclass = {cs.CL},
|
|
90
|
+
title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
|
|
91
|
+
url = {https://arxiv.org/abs/2507.21500},
|
|
92
|
+
year = {2025},
|
|
93
|
+
}
|
|
94
|
+
""",
|
|
95
|
+
prompt={"query": "Given a query, retrieve relevant documents from MS MARCO-VN"},
|
|
96
|
+
adapted_from=["MSMARCO-VN"],
|
|
97
|
+
)
|
|
@@ -9,11 +9,7 @@ class NFCorpusVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/nfcorpus-vn",
|
|
10
10
|
"revision": "a13d72fbb859be3dc19ab669d1ec9510407d2dcd",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
14
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
15
|
-
- Applies advanced embedding models to filter the translations.
|
|
16
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
17
13
|
reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/",
|
|
18
14
|
type="Retrieval",
|
|
19
15
|
category="t2t",
|
|
@@ -9,11 +9,7 @@ class NQVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/nq-vn",
|
|
10
10
|
"revision": "40a6d7f343b9c9f4855a426d8c431ad5f8aaf56b",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
14
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
15
|
-
- Applies advanced embedding models to filter the translations.
|
|
16
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
17
13
|
reference="https://ai.google.com/research/NaturalQuestions/",
|
|
18
14
|
type="Retrieval",
|
|
19
15
|
category="t2t",
|
|
@@ -40,3 +36,42 @@ class NQVN(AbsTaskRetrieval):
|
|
|
40
36
|
""",
|
|
41
37
|
adapted_from=["NQ"],
|
|
42
38
|
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class NanoNQVN(AbsTaskRetrieval):
|
|
42
|
+
metadata = TaskMetadata(
|
|
43
|
+
name="NanoNQ-VN",
|
|
44
|
+
dataset={
|
|
45
|
+
"path": "GreenNode/nano-nq-vn",
|
|
46
|
+
"revision": "1ad4d6556fe0e5314994839089ce070fb0db8b19",
|
|
47
|
+
},
|
|
48
|
+
description="NanoNQVN is a small version of A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
49
|
+
reference="https://ai.google.com/research/NaturalQuestions/",
|
|
50
|
+
type="Retrieval",
|
|
51
|
+
category="t2t",
|
|
52
|
+
eval_splits=["test"],
|
|
53
|
+
eval_langs=["vie-Latn"],
|
|
54
|
+
main_score="ndcg_at_10",
|
|
55
|
+
date=("2025-07-29", "2025-07-30"),
|
|
56
|
+
license="cc-by-sa-4.0",
|
|
57
|
+
annotations_creators="derived",
|
|
58
|
+
dialect=[],
|
|
59
|
+
sample_creation="machine-translated and LM verified",
|
|
60
|
+
domains=["Written", "Encyclopaedic"],
|
|
61
|
+
task_subtypes=["Question answering"],
|
|
62
|
+
bibtex_citation=r"""
|
|
63
|
+
@misc{pham2025vnmtebvietnamesemassivetext,
|
|
64
|
+
archiveprefix = {arXiv},
|
|
65
|
+
author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
|
|
66
|
+
eprint = {2507.21500},
|
|
67
|
+
primaryclass = {cs.CL},
|
|
68
|
+
title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
|
|
69
|
+
url = {https://arxiv.org/abs/2507.21500},
|
|
70
|
+
year = {2025},
|
|
71
|
+
}
|
|
72
|
+
""",
|
|
73
|
+
prompt={
|
|
74
|
+
"query": "Given a question, retrieve Wikipedia passages that answer the question"
|
|
75
|
+
},
|
|
76
|
+
adapted_from=["NQ-VN"],
|
|
77
|
+
)
|
|
@@ -9,12 +9,7 @@ class QuoraVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/quora-vn",
|
|
10
10
|
"revision": "3363d81e41b67c1032bf3b234882a03d271e2289",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
question, find other (duplicate) questions.
|
|
14
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
15
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
16
|
-
- Applies advanced embedding models to filter the translations.
|
|
17
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform. Given a question, find other (duplicate) questions. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
18
13
|
reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs",
|
|
19
14
|
type="Retrieval",
|
|
20
15
|
category="t2t",
|
|
@@ -9,11 +9,7 @@ class SciFactVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/scifact-vn",
|
|
10
10
|
"revision": "483a7cf890c523c954e7751d328c5bb65061dcff",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
14
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
15
|
-
- Applies advanced embedding models to filter the translations.
|
|
16
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
17
13
|
reference="https://github.com/allenai/scifact",
|
|
18
14
|
type="Retrieval",
|
|
19
15
|
category="t2t",
|
|
@@ -9,12 +9,7 @@ class SCIDOCSVN(AbsTaskRetrieval):
|
|
|
9
9
|
"path": "GreenNode/scidocs-vn",
|
|
10
10
|
"revision": "724cddfa9d328a193f303a0a9b7789468ac79f26",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
prediction, to document classification and recommendation.
|
|
14
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
15
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
16
|
-
- Applies advanced embedding models to filter the translations.
|
|
17
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
12
|
+
description="A translated dataset from SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
18
13
|
reference="https://allenai.org/data/scidocs",
|
|
19
14
|
type="Retrieval",
|
|
20
15
|
category="t2t",
|
|
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class Touche2020VN(AbsTaskRetrieval):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="Touche2020-VN",
|
|
8
|
-
description="
|
|
9
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
10
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
11
|
-
- Applies advanced embedding models to filter the translations.
|
|
12
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
8
|
+
description="A translated dataset from Touché Task 1: Argument Retrieval for Controversial Questions The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
13
9
|
reference="https://webis.de/events/touche-20/shared-task-1.html",
|
|
14
10
|
dataset={
|
|
15
11
|
"path": "GreenNode/webis-touche2020-vn",
|
|
@@ -5,11 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class TRECCOVIDVN(AbsTaskRetrieval):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="TRECCOVID-VN",
|
|
8
|
-
description="
|
|
9
|
-
The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
|
|
10
|
-
- The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
|
|
11
|
-
- Applies advanced embedding models to filter the translations.
|
|
12
|
-
- Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
|
|
8
|
+
description="A translated dataset from TRECCOVID is an ad-hoc search challenge based on the COVID-19 dataset containing scientific articles related to the COVID-19 pandemic. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
|
|
13
9
|
reference="https://ir.nist.gov/covidSubmit/index.html",
|
|
14
10
|
dataset={
|
|
15
11
|
"path": "GreenNode/trec-covid-vn",
|
|
@@ -0,0 +1,42 @@
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+TEST_SAMPLES = 2048
+
+
+class TVPLRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="TVPLRetrieval",
+        description="A Vietnamese dataset for evaluating legal text retrieval. From Thu vien phap luat (TVPL) dataset: Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models.",
+        reference="https://aclanthology.org/2020.coling-main.233.pdf",
+        dataset={
+            "path": "GreenNode/TVPL-Retrieval-VN",
+            "revision": "6661dba4dfedff606537732d9f35f2c3738b081a",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ndcg_at_10",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        dialect=[],
+        annotations_creators="human-annotated",
+        domains=["Legal"],
+        task_subtypes=["Question answering"],
+        sample_creation="found",
+        bibtex_citation=r"""
+@article{10.1145/3732938,
+  address = {New York, NY, USA},
+  author = {Le, Huong and Luu, Ngoc and Nguyen, Thanh and Dao, Tuan and Dinh, Sang},
+  doi = {10.1145/3732938},
+  issn = {2375-4699},
+  journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+  publisher = {Association for Computing Machinery},
+  title = {Optimizing Answer Generator in Vietnamese Legal Question Answering Systems Using Language Models},
+  url = {https://doi.org/10.1145/3732938},
+  year = {2025},
+}
+""",
+    )
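Once this file ships, the task is addressable by the name declared in its `TaskMetadata`. A minimal sketch, assuming the standard `mteb.get_task` lookup:

```python
import mteb

# Fetch the newly added task by name and inspect its metadata.
task = mteb.get_task("TVPLRetrieval")
print(task.metadata.main_score)  # ndcg_at_10
print(task.metadata.eval_langs)  # ['vie-Latn']
```

Note that `TEST_SAMPLES = 2048` is defined at module level but not referenced in the metadata shown here; it is presumably used elsewhere for downsampling.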
@@ -24,5 +24,19 @@ class ZacLegalTextRetrieval(AbsTaskRetrieval):
         annotations_creators="human-annotated",
         dialect=[],
         sample_creation="found",
-        bibtex_citation=""
+        bibtex_citation=r"""
+@inproceedings{10.1007/978-981-95-1746-6_17,
+  address = {Singapore},
+  author = {Pham, Bao Loc
+and Hoang, Quoc Viet
+and Luu, Quy Tung
+and Vo, Trong Thu},
+  booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
+  isbn = {978-981-95-1746-6},
+  pages = {153--163},
+  publisher = {Springer Nature Singapore},
+  title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
+  year = {2026},
+}
+""",
     )
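The switch from an empty `bibtex_citation=""` to a raw triple-quoted string is the safe way to embed BibTeX: raw strings keep backslash escapes, common in LaTeX, intact. A quick illustration:

```python
plain = "\today"  # "\t" is interpreted as a tab character: length 5
raw = r"\today"   # backslash is preserved for LaTeX/BibTeX: length 6
assert (len(plain), len(raw)) == (5, 6)
```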
@@ -9,11 +9,7 @@ class BiossesSTSVN(AbsTaskSTS):
             "path": "GreenNode/biosses-sts-vn",
             "revision": "1dae4a6df91c0852680cd4ab48c8c1d8a9ed49b2",
         },
-        description="""A translated dataset from Biomedical Semantic Similarity Estimation.
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Biomedical Semantic Similarity Estimation. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html",
         type="STS",
         category="t2c",
@@ -9,11 +9,7 @@ class SickrSTSVN(AbsTaskSTS):
             "path": "GreenNode/sickr-sts-vn",
             "revision": "bc89f0401983c456b609f7fb324278f346b2cccf",
         },
-        description="""A translated dataset from Semantic Textual Similarity SICK-R dataset as described here:
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Semantic Textual Similarity SICK-R dataset as described here: The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://aclanthology.org/2020.lrec-1.207",
         type="STS",
         category="t2c",
@@ -9,11 +9,7 @@ class STSBenchmarkSTSVN(AbsTaskSTS):
             "path": "GreenNode/stsbenchmark-sts-vn",
             "revision": "f24d66738cda4a02138ada5af7689a92ce1fcad6",
         },
-        description="""A translated dataset from Semantic Textual Similarity Benchmark (STSbenchmark) dataset.
-        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
-        - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation.
-        - Applies advanced embedding models to filter the translations.
-        - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""",
+        description="A translated dataset from Semantic Textual Similarity Benchmark (STSbenchmark) dataset. The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. - Applies advanced embedding models to filter the translations. - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.",
         reference="https://github.com/PhilipMay/stsb-multi-mt/",
         type="STS",
         category="t2c",
@@ -9,7 +9,7 @@ from mteb.abstasks.zeroshot_classification import (
 class GTSRBZeroShotClassification(AbsTaskZeroShotClassification):
     metadata = TaskMetadata(
         name="GTSRBZeroShot",
-        description="
+        description="The German Traffic Sign Recognition Benchmark (GTSRB) is a multi-class classification dataset for traffic signs. It consists of dataset of more than 50,000 traffic sign images. The dataset comprises 43 classes with unbalanced class frequencies.",
         reference="https://benchmark.ini.rub.de/",
         dataset={
             "path": "clip-benchmark/wds_gtsrb",
@@ -9,7 +9,7 @@ from mteb.abstasks.zeroshot_classification import (
 class PatchCamelyonZeroShotClassification(AbsTaskZeroShotClassification):
     metadata = TaskMetadata(
         name="PatchCamelyonZeroShot",
-        description="
+        description="Histopathology diagnosis classification dataset.",
         reference="https://link.springer.com/chapter/10.1007/978-3-030-00934-2_24",
         dataset={
             "path": "clip-benchmark/wds_vtab-pcam",
@@ -7,11 +7,7 @@ from mteb.abstasks.zeroshot_classification import (
 class UCF101ZeroShotClassification(AbsTaskZeroShotClassification):
     metadata = TaskMetadata(
         name="UCF101ZeroShot",
-        description="""UCF101 is an action recognition data set of realistic
-        action videos collected from YouTube, having 101 action categories. This
-        version of the dataset does not contain images but images saved frame by
-        frame. Train and test splits are generated based on the authors' first
-        version train/test list.""",
+        description="UCF101 is an action recognition data set of realistic action videos collected from YouTube, having 101 action categories. This version of the dataset does not contain images but images saved frame by frame. Train and test splits are generated based on the authors' first version train/test list.",
         reference="https://huggingface.co/datasets/flwrlabs/ucf101",
         dataset={
             "path": "flwrlabs/ucf101",
mteb/types/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from ._encoder_io import (
     Conversation,
     ConversationTurn,
     CorpusDatasetType,
+    EncodeKwargs,
     InstructionDatasetType,
     PromptType,
     QueryDatasetType,
@@ -30,6 +31,7 @@ __all__ = [
     "Conversation",
     "ConversationTurn",
     "CorpusDatasetType",
+    "EncodeKwargs",
     "HFSubset",
     "ISOLanguage",
     "ISOLanguageScript",
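With both the import and the `__all__` entry in place, the new TypedDict is part of the public surface of `mteb.types`:

```python
# Downstream code can now import the type from the package root module.
from mteb.types import EncodeKwargs

kwargs: EncodeKwargs = {"batch_size": 64, "show_progress_bar": False}
```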
mteb/types/_encoder_io.py
CHANGED
@@ -1,13 +1,30 @@
+from __future__ import annotations
+
 from collections.abc import Mapping
 from enum import Enum
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict
 
 import numpy as np
 import torch
 from datasets import Dataset
-from PIL import Image
 from typing_extensions import NotRequired
 
+if TYPE_CHECKING:
+    from PIL import Image
+
+
+class EncodeKwargs(TypedDict):
+    """Keyword arguments for encoding methods.
+
+    Attributes:
+        batch_size: The batch size to use for encoding.
+        show_progress_bar: Whether to show a progress bar during encoding.
+    """
+
+    batch_size: NotRequired[int]
+    show_progress_bar: NotRequired[bool]
+
+
 # --- Output types ---
 Array = np.ndarray | torch.Tensor
 """General array type, can be a numpy array or a torch tensor."""
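Two things happen in this hunk. First, `PIL` moves under `TYPE_CHECKING`, so Pillow is no longer a hard runtime import; `from __future__ import annotations` keeps `Image.Image` annotations valid because they are never evaluated at runtime. Second, `EncodeKwargs` declares both keys as `NotRequired`, so any subset of them may be passed. A self-contained sketch of the same two patterns:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict

from typing_extensions import NotRequired

if TYPE_CHECKING:
    # Visible to type checkers only; no runtime dependency on Pillow.
    from PIL import Image


class EncodeKwargs(TypedDict):
    batch_size: NotRequired[int]
    show_progress_bar: NotRequired[bool]


def load_image(path: str) -> Image.Image:  # annotation stays a string at runtime
    raise NotImplementedError


empty: EncodeKwargs = {}                   # valid: both keys are optional
partial: EncodeKwargs = {"batch_size": 8}  # valid: keys can be supplied individually
```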
mteb/types/_result.py
CHANGED
@@ -1,3 +1,4 @@
+from collections.abc import Mapping
 from typing import Any, NamedTuple
 
 HFSubset = str
@@ -8,7 +9,7 @@ SplitName = str
 Score = Any
 """A score value, could e.g. be accuracy. Normally it is a float or int, but it can take on any value. Should be json serializable."""
 
-ScoresDict =
+ScoresDict = Mapping[str, Score]
 """A dictionary of scores, typically also include metadata, e.g {'main_score': 0.5, 'accuracy': 0.5, 'f1': 0.6, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}"""
 
 
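Annotating `ScoresDict` as `Mapping[str, Score]` rather than a concrete `dict` advertises read-only intent: lookups are fine, but a type checker rejects mutation because `Mapping` has no `__setitem__`, and any mapping implementation is accepted. For example:

```python
from collections.abc import Mapping
from typing import Any

Score = Any
ScoresDict = Mapping[str, Score]

scores: ScoresDict = {
    "main_score": 0.5,
    "accuracy": 0.5,
    "f1": 0.6,
    "hf_subset": "en-de",
    "languages": ["eng-Latn", "deu-Latn"],
}
print(scores["main_score"])
# scores["accuracy"] = 1.0  # flagged by a type checker: Mapping is read-only
```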
mteb/types/statistics.py
CHANGED
@@ -10,8 +10,14 @@ class SplitDescriptiveStatistics(TypedDict):
 
 
 class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics):
-    """Class for descriptive statistics for the full task."""
+    """Class for descriptive statistics for the full task.
 
+    Attributes:
+        num_samples: Total number of samples
+        hf_subset_descriptive_stats: HFSubset descriptive statistics (only for multilingual datasets)
+    """
+
+    num_samples: int
     hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]]
 
 
@@ -88,9 +94,9 @@ class ScoreStatistics(TypedDict):
         max_score: Maximum score
     """
 
-    min_score: int
+    min_score: int | float
     avg_score: float
-    max_score: int
+    max_score: int | float
 
 
 class TopRankedStatistics(TypedDict):