mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/_create_dataloaders.py +6 -3
- mteb/_evaluators/any_sts_evaluator.py +21 -12
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/_evaluators/clustering_evaluator.py +1 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
- mteb/_evaluators/pair_classification_evaluator.py +30 -38
- mteb/_evaluators/sklearn_evaluator.py +15 -28
- mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
- mteb/_evaluators/text/summarization_evaluator.py +4 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/abstasks/clustering.py +1 -1
- mteb/abstasks/clustering_legacy.py +3 -0
- mteb/abstasks/multilabel_classification.py +10 -3
- mteb/abstasks/pair_classification.py +8 -1
- mteb/abstasks/sts.py +7 -0
- mteb/abstasks/task_metadata.py +1 -0
- mteb/benchmarks/_create_table.py +84 -37
- mteb/benchmarks/benchmark.py +74 -15
- mteb/benchmarks/benchmarks/__init__.py +8 -0
- mteb/benchmarks/benchmarks/benchmarks.py +259 -15
- mteb/benchmarks/get_benchmark.py +2 -0
- mteb/cache.py +47 -10
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
- mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
- mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
- mteb/evaluate.py +65 -45
- mteb/leaderboard/app.py +268 -133
- mteb/leaderboard/benchmark_selector.py +14 -5
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +82 -17
- mteb/models/__init__.py +4 -1
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/cache_wrappers/cache_wrapper.py +1 -1
- mteb/models/get_model_meta.py +3 -114
- mteb/models/instruct_wrapper.py +5 -1
- mteb/models/model_implementations/align_models.py +7 -0
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +65 -0
- mteb/models/model_implementations/ara_models.py +8 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +4 -0
- mteb/models/model_implementations/bge_models.py +60 -0
- mteb/models/model_implementations/bica_model.py +35 -0
- mteb/models/model_implementations/blip2_models.py +11 -0
- mteb/models/model_implementations/blip_models.py +27 -0
- mteb/models/model_implementations/bm25.py +1 -0
- mteb/models/model_implementations/bmretriever_models.py +4 -0
- mteb/models/model_implementations/cadet_models.py +9 -0
- mteb/models/model_implementations/cde_models.py +14 -0
- mteb/models/model_implementations/clip_models.py +3 -0
- mteb/models/model_implementations/clips_models.py +100 -0
- mteb/models/model_implementations/codefuse_models.py +162 -0
- mteb/models/model_implementations/codesage_models.py +15 -0
- mteb/models/model_implementations/cohere_models.py +8 -1
- mteb/models/model_implementations/cohere_v.py +5 -0
- mteb/models/model_implementations/colpali_models.py +14 -6
- mteb/models/model_implementations/colqwen_models.py +271 -1
- mteb/models/model_implementations/colsmol_models.py +2 -0
- mteb/models/model_implementations/conan_models.py +1 -0
- mteb/models/model_implementations/dino_models.py +171 -0
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +12 -101
- mteb/models/model_implementations/e5_v.py +1 -0
- mteb/models/model_implementations/eagerworks_models.py +164 -0
- mteb/models/model_implementations/emillykkejensen_models.py +91 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +32 -0
- mteb/models/model_implementations/evaclip_models.py +4 -0
- mteb/models/model_implementations/fa_models.py +58 -0
- mteb/models/model_implementations/facebookai.py +193 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +11 -5
- mteb/models/model_implementations/google_models.py +16 -5
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +78 -0
- mteb/models/model_implementations/hinvec_models.py +1 -0
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +255 -2
- mteb/models/model_implementations/jina_clip.py +1 -0
- mteb/models/model_implementations/jina_models.py +209 -5
- mteb/models/model_implementations/kalm_models.py +203 -25
- mteb/models/model_implementations/kblab.py +31 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
- mteb/models/model_implementations/kfst.py +25 -0
- mteb/models/model_implementations/kowshik24_models.py +32 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +3 -2
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -0
- mteb/models/model_implementations/llm2vec_models.py +8 -0
- mteb/models/model_implementations/mcinext_models.py +3 -0
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +362 -0
- mteb/models/model_implementations/mme5_models.py +1 -0
- mteb/models/model_implementations/moco_models.py +11 -0
- mteb/models/model_implementations/mod_models.py +191 -0
- mteb/models/model_implementations/model2vec_models.py +13 -0
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/mxbai_models.py +9 -0
- mteb/models/model_implementations/nbailab.py +70 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
- mteb/models/model_implementations/nomic_models.py +156 -4
- mteb/models/model_implementations/nomic_models_vision.py +7 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
- mteb/models/model_implementations/nvidia_models.py +4 -1
- mteb/models/model_implementations/octen_models.py +195 -0
- mteb/models/model_implementations/openai_models.py +20 -16
- mteb/models/model_implementations/openclip_models.py +24 -0
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
- mteb/models/model_implementations/ops_moa_models.py +4 -2
- mteb/models/model_implementations/pawan_models.py +39 -0
- mteb/models/model_implementations/piccolo_models.py +8 -0
- mteb/models/model_implementations/promptriever_models.py +8 -4
- mteb/models/model_implementations/pylate_models.py +37 -4
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +6 -3
- mteb/models/model_implementations/qzhou_models.py +3 -1
- mteb/models/model_implementations/random_baseline.py +16 -21
- mteb/models/model_implementations/rasgaard_models.py +34 -0
- mteb/models/model_implementations/reasonir_model.py +1 -0
- mteb/models/model_implementations/repllama_models.py +2 -0
- mteb/models/model_implementations/rerankers_custom.py +3 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +51 -0
- mteb/models/model_implementations/ruri_models.py +322 -0
- mteb/models/model_implementations/salesforce_models.py +3 -0
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
- mteb/models/model_implementations/seed_models.py +1 -0
- mteb/models/model_implementations/sentence_transformers_models.py +57 -0
- mteb/models/model_implementations/shuu_model.py +32 -31
- mteb/models/model_implementations/siglip_models.py +10 -0
- mteb/models/model_implementations/sonar_models.py +1 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +376 -0
- mteb/models/model_implementations/ua_sentence_models.py +10 -0
- mteb/models/model_implementations/uae_models.py +1 -0
- mteb/models/model_implementations/vdr_models.py +2 -0
- mteb/models/model_implementations/vi_vn_models.py +39 -0
- mteb/models/model_implementations/vista_models.py +2 -0
- mteb/models/model_implementations/vlm2vec_models.py +2 -0
- mteb/models/model_implementations/voyage_models.py +15 -0
- mteb/models/model_implementations/voyage_v.py +8 -2
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +34 -0
- mteb/models/model_implementations/yuan_models_en.py +58 -0
- mteb/models/model_meta.py +442 -22
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +165 -48
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/results/benchmark_results.py +88 -47
- mteb/results/model_result.py +11 -4
- mteb/results/task_result.py +37 -19
- mteb/similarity_functions.py +49 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
- mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
- mteb/tasks/classification/ara/ajgt.py +1 -2
- mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
- mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
- mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
- mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
- mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
- mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
- mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
- mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
- mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
- mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
- mteb/tasks/classification/eng/arxiv_classification.py +1 -2
- mteb/tasks/classification/eng/banking77_classification.py +1 -2
- mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
- mteb/tasks/classification/eng/emotion_classification.py +1 -2
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
- mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
- mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
- mteb/tasks/classification/eng/imdb_classification.py +1 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
- mteb/tasks/classification/eng/news_classification.py +1 -2
- mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
- mteb/tasks/classification/eng/patent_classification.py +1 -2
- mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
- mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
- mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
- mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
- mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
- mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
- mteb/tasks/classification/eng/ucf101_classification.py +1 -5
- mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
- mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
- mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
- mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
- mteb/tasks/classification/est/estonian_valence.py +1 -2
- mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
- mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
- mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
- mteb/tasks/classification/fra/french_book_reviews.py +1 -2
- mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
- mteb/tasks/classification/heb/__init__.py +6 -1
- mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
- mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
- mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
- mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
- mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
- mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
- mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
- mteb/tasks/classification/jpn/wrime_classification.py +1 -2
- mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
- mteb/tasks/classification/kor/klue_tc.py +1 -2
- mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
- mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
- mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
- mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
- mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
- mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
- mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
- mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
- mteb/tasks/classification/multilingual/scala_classification.py +1 -2
- mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
- mteb/tasks/classification/mya/myanmar_news.py +1 -2
- mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
- mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
- mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
- mteb/tasks/classification/nld/iconclass_classification.py +3 -0
- mteb/tasks/classification/nld/open_tender_classification.py +3 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
- mteb/tasks/classification/nob/no_rec_classification.py +1 -2
- mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
- mteb/tasks/classification/ory/odia_news_classification.py +1 -2
- mteb/tasks/classification/pol/polish_classification.py +3 -6
- mteb/tasks/classification/ron/moroco.py +1 -2
- mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
- mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
- mteb/tasks/classification/rus/georeview_classification.py +1 -2
- mteb/tasks/classification/rus/headline_classification.py +1 -2
- mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
- mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
- mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
- mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
- mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
- mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
- mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
- mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
- mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
- mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
- mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
- mteb/tasks/classification/swe/dalaj_classification.py +1 -2
- mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
- mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
- mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
- mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
- mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
- mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
- mteb/tasks/classification/tur/__init__.py +4 -0
- mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
- mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
- mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
- mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
- mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
- mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
- mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
- mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
- mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
- mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
- mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
- mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
- mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
- mteb/tasks/classification/zho/cmteb_classification.py +5 -10
- mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
- mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
- mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
- mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
- mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
- mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
- mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
- mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
- mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
- mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
- mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
- mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
- mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
- mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
- mteb/tasks/pair_classification/rus/__init__.py +2 -2
- mteb/tasks/pair_classification/rus/terra.py +51 -25
- mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
- mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
- mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
- mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
- mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
- mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
- mteb/tasks/retrieval/kor/__init__.py +2 -1
- mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
- mteb/tasks/retrieval/multilingual/__init__.py +22 -0
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
- mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
- mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
- mteb/tasks/retrieval/nld/__init__.py +8 -4
- mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
- mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
- mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
- mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
- mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
- mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
- mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
- mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
- mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
- mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
- mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
- mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
- mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
- mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
- mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
|
@@ -5,14 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class KorHateClassification(AbsTaskClassification):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="KorHateClassification",
|
|
8
|
-
description="
|
|
9
|
-
toxic speech detection from a Korean online entertainment news aggregator. Recently,
|
|
10
|
-
two young Korean celebrities suffered from a series of tragic incidents that led to two
|
|
11
|
-
major Korean web portals to close the comments section on their platform. However, this only
|
|
12
|
-
serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset
|
|
13
|
-
hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators,
|
|
14
|
-
consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers.
|
|
15
|
-
""",
|
|
8
|
+
description="The dataset was created to provide the first human-labeled Korean corpus for toxic speech detection from a Korean online entertainment news aggregator. Recently, two young Korean celebrities suffered from a series of tragic incidents that led to two major Korean web portals to close the comments section on their platform. However, this only serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators, consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers.",
|
|
16
9
|
dataset={
|
|
17
10
|
"path": "mteb/KorHateClassification",
|
|
18
11
|
"revision": "a4e70398c3689a5f55cd1f4a447d8d2da0a7dd1e",
|
|
@@ -48,15 +41,7 @@ class KorHateClassification(AbsTaskClassification):
|
|
|
48
41
|
class KorHateClassificationV2(AbsTaskClassification):
|
|
49
42
|
metadata = TaskMetadata(
|
|
50
43
|
name="KorHateClassification.v2",
|
|
51
|
-
description="
|
|
52
|
-
toxic speech detection from a Korean online entertainment news aggregator. Recently,
|
|
53
|
-
two young Korean celebrities suffered from a series of tragic incidents that led to two
|
|
54
|
-
major Korean web portals to close the comments section on their platform. However, this only
|
|
55
|
-
serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset
|
|
56
|
-
hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators,
|
|
57
|
-
consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers.
|
|
58
|
-
|
|
59
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
44
|
+
description="The dataset was created to provide the first human-labeled Korean corpus for toxic speech detection from a Korean online entertainment news aggregator. Recently, two young Korean celebrities suffered from a series of tragic incidents that led to two major Korean web portals to close the comments section on their platform. However, this only serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators, consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
60
45
|
dataset={
|
|
61
46
|
"path": "mteb/kor_hate",
|
|
62
47
|
"revision": "5d64e6dcbe9204c934e9a3852b1130a6f2d51ad4",
|
|
@@ -5,15 +5,7 @@ from mteb.abstasks.task_metadata import TaskMetadata
|
|
|
5
5
|
class KorSarcasmClassification(AbsTaskClassification):
|
|
6
6
|
metadata = TaskMetadata(
|
|
7
7
|
name="KorSarcasmClassification",
|
|
8
|
-
description=""
|
|
9
|
-
The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original
|
|
10
|
-
meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These
|
|
11
|
-
tweets were gathered by querying for: irony sarcastic, and
|
|
12
|
-
sarcasm.
|
|
13
|
-
The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm
|
|
14
|
-
and variants of it were used to return tweets. It was preprocessed by removing the keyword
|
|
15
|
-
hashtag, urls and mentions of the user to preserve anonymity.
|
|
16
|
-
""",
|
|
8
|
+
description="The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These tweets were gathered by querying for: irony sarcastic, and sarcasm. The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm and variants of it were used to return tweets. It was preprocessed by removing the keyword hashtag, urls and mentions of the user to preserve anonymity.",
|
|
17
9
|
dataset={
|
|
18
10
|
"path": "mteb/KorSarcasmClassification",
|
|
19
11
|
"revision": "6701f384372c04aa8c64b10582e72eb84135a1d4",
|
|
@@ -49,16 +41,7 @@ class KorSarcasmClassification(AbsTaskClassification):
|
|
|
49
41
|
class KorSarcasmClassificationV2(AbsTaskClassification):
|
|
50
42
|
metadata = TaskMetadata(
|
|
51
43
|
name="KorSarcasmClassification.v2",
|
|
52
|
-
description=""
|
|
53
|
-
The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original
|
|
54
|
-
meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These
|
|
55
|
-
tweets were gathered by querying for: irony sarcastic, and
|
|
56
|
-
sarcasm.
|
|
57
|
-
The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm
|
|
58
|
-
and variants of it were used to return tweets. It was preprocessed by removing the keyword
|
|
59
|
-
hashtag, urls and mentions of the user to preserve anonymity.
|
|
60
|
-
|
|
61
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
44
|
+
description="The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These tweets were gathered by querying for: irony sarcastic, and sarcasm. The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm and variants of it were used to return tweets. It was preprocessed by removing the keyword hashtag, urls and mentions of the user to preserve anonymity. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
62
45
|
dataset={
|
|
63
46
|
"path": "mteb/kor_sarcasm",
|
|
64
47
|
"revision": "0e5e17b4dba569776e445f5639ba13dc406b2b0e",
|
|
@@ -42,8 +42,7 @@ class KurdishSentimentClassification(AbsTaskClassification):
|
|
|
42
42
|
class KurdishSentimentClassificationV2(AbsTaskClassification):
|
|
43
43
|
metadata = TaskMetadata(
|
|
44
44
|
name="KurdishSentimentClassification.v2",
|
|
45
|
-
description="
|
|
46
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
45
|
+
description="Kurdish Sentiment Dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
47
46
|
reference="https://link.springer.com/article/10.1007/s10579-023-09716-6",
|
|
48
47
|
dataset={
|
|
49
48
|
"path": "mteb/kurdish_sentiment",
|
|
@@ -42,8 +42,7 @@ class MalayalamNewsClassification(AbsTaskClassification):
|
|
|
42
42
|
class MalayalamNewsClassificationV2(AbsTaskClassification):
|
|
43
43
|
metadata = TaskMetadata(
|
|
44
44
|
name="MalayalamNewsClassification.v2",
|
|
45
|
-
description="
|
|
46
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
45
|
+
description="A Malayalam dataset for 3-class classification of Malayalam news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
47
46
|
reference="https://github.com/goru001/nlp-for-malyalam",
|
|
48
47
|
dataset={
|
|
49
48
|
"path": "mteb/malayalam_news",
|
|
@@ -43,8 +43,7 @@ class MarathiNewsClassification(AbsTaskClassification):
|
|
|
43
43
|
class MarathiNewsClassificationV2(AbsTaskClassification):
|
|
44
44
|
metadata = TaskMetadata(
|
|
45
45
|
name="MarathiNewsClassification.v2",
|
|
46
|
-
description="
|
|
47
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
46
|
+
description="A Marathi dataset for 3-class classification of Marathi news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
48
47
|
reference="https://github.com/goru001/nlp-for-marathi",
|
|
49
48
|
dataset={
|
|
50
49
|
"path": "mteb/marathi_news",
|
|
@@ -49,8 +49,7 @@ Bontcheva, Kalina},
|
|
|
49
49
|
class MacedonianTweetSentimentClassificationV2(AbsTaskClassification):
|
|
50
50
|
metadata = TaskMetadata(
|
|
51
51
|
name="MacedonianTweetSentimentClassification.v2",
|
|
52
|
-
description="
|
|
53
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
52
|
+
description="An Macedonian dataset for tweet sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
54
53
|
reference="https://aclanthology.org/R15-1034/",
|
|
55
54
|
dataset={
|
|
56
55
|
"path": "mteb/macedonian_tweet_sentiment",
|
|
@@ -10,12 +10,7 @@ _LANGS = {
|
|
|
10
10
|
class CataloniaTweetClassification(AbsTaskClassification):
|
|
11
11
|
metadata = TaskMetadata(
|
|
12
12
|
name="CataloniaTweetClassification",
|
|
13
|
-
description="
|
|
14
|
-
messages for automatic stance detection. The data was collected over 12 days during February and March
|
|
15
|
-
of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.
|
|
16
|
-
Each corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance
|
|
17
|
-
towards the target - independence of Catalonia.
|
|
18
|
-
""",
|
|
13
|
+
description="This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia. Each corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.",
|
|
19
14
|
reference="https://aclanthology.org/2020.lrec-1.171/",
|
|
20
15
|
dataset={
|
|
21
16
|
"path": "community-datasets/catalonia_independence",
|
|
@@ -24,10 +24,7 @@ class MultiHateClassification(AbsTaskClassification):
|
|
|
24
24
|
"path": "mteb/multi-hatecheck",
|
|
25
25
|
"revision": "8f95949846bb9e33c6aaf730ccfdb8fe6bcfb7a9",
|
|
26
26
|
},
|
|
27
|
-
description="
|
|
28
|
-
(hateful vs non-hateful) labels. Includes 25+ distinct types of hate
|
|
29
|
-
and challenging non-hate, and 11 languages.
|
|
30
|
-
""",
|
|
27
|
+
description="Hate speech detection dataset with binary (hateful vs non-hateful) labels. Includes 25+ distinct types of hate and challenging non-hate, and 11 languages.",
|
|
31
28
|
reference="https://aclanthology.org/2022.woah-1.15/",
|
|
32
29
|
type="Classification",
|
|
33
30
|
category="t2c",
|
|
@@ -9,11 +9,7 @@ class RuSciBenchCoreRiscClassification(AbsTaskClassification):
|
|
|
9
9
|
"path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
10
10
|
"revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e",
|
|
11
11
|
},
|
|
12
|
-
description="
|
|
13
|
-
(based on its title and abstract) belongs to the Core of the Russian Science Citation Index (RISC).
|
|
14
|
-
The RISC includes a wide range of publications, but the Core RISC comprises the most cited and prestigious
|
|
15
|
-
journals, dissertations, theses, monographs, and studies. The task is provided for both Russian and English
|
|
16
|
-
versions of the paper's title and abstract.""",
|
|
12
|
+
description="This binary classification task aims to determine whether a scientific paper (based on its title and abstract) belongs to the Core of the Russian Science Citation Index (RISC). The RISC includes a wide range of publications, but the Core RISC comprises the most cited and prestigious journals, dissertations, theses, monographs, and studies. The task is provided for both Russian and English versions of the paper's title and abstract.",
|
|
17
13
|
reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
18
14
|
type="Classification",
|
|
19
15
|
category="t2c",
|
|
@@ -57,10 +53,7 @@ class RuSciBenchPubTypeClassification(AbsTaskClassification):
|
|
|
57
53
|
"path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
58
54
|
"revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e",
|
|
59
55
|
},
|
|
60
|
-
description="
|
|
61
|
-
into different publication types. The dataset identifies the following types:
|
|
62
|
-
'Article', 'Conference proceedings', 'Survey', 'Miscellanea', 'Short message', 'Review', and 'Personalia'.
|
|
63
|
-
This task is available for both Russian and English versions of the paper's title and abstract.""",
|
|
56
|
+
description="This task involves classifying scientific papers (based on their title and abstract) into different publication types. The dataset identifies the following types: 'Article', 'Conference proceedings', 'Survey', 'Miscellanea', 'Short message', 'Review', and 'Personalia'. This task is available for both Russian and English versions of the paper's title and abstract.",
|
|
64
57
|
reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
65
58
|
type="Classification",
|
|
66
59
|
category="t2c",
|
|
@@ -104,13 +97,7 @@ class RuSciBenchGRNTIClassificationV2(AbsTaskClassification):
|
|
|
104
97
|
"path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
105
98
|
"revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e",
|
|
106
99
|
},
|
|
107
|
-
description="
|
|
108
|
-
Technical Information) rubricator. GRNTI is a universal hierarchical classification of knowledge domains
|
|
109
|
-
adopted in Russia and CIS countries to systematize the entire flow of scientific and technical information.
|
|
110
|
-
This task uses the first level of the GRNTI hierarchy and top 28 classes by frequency.
|
|
111
|
-
|
|
112
|
-
In this version, English language support has been added and data partitioning has been slightly modified.
|
|
113
|
-
""",
|
|
100
|
+
description="Classification of scientific papers based on the GRNTI (State Rubricator of Scientific and Technical Information) rubricator. GRNTI is a universal hierarchical classification of knowledge domains adopted in Russia and CIS countries to systematize the entire flow of scientific and technical information. This task uses the first level of the GRNTI hierarchy and top 28 classes by frequency. In this version, English language support has been added and data partitioning has been slightly modified.",
|
|
114
101
|
reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
115
102
|
type="Classification",
|
|
116
103
|
category="t2c",
|
|
@@ -154,13 +141,7 @@ class RuSciBenchOECDClassificationV2(AbsTaskClassification):
|
|
|
154
141
|
"path": "mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
155
142
|
"revision": "fbc0599a0b5f00b3c7d87ab4d13490f04fb77f8e",
|
|
156
143
|
},
|
|
157
|
-
description="
|
|
158
|
-
(Organization for Economic Co-operation and Development) rubricator. OECD provides
|
|
159
|
-
a hierarchical 3-level system of classes for labeling scientific articles.
|
|
160
|
-
This task uses the first two levels of the OECD hierarchy, top 29 classes.
|
|
161
|
-
|
|
162
|
-
In this version, English language support has been added and data partitioning has been slightly modified.
|
|
163
|
-
""",
|
|
144
|
+
description="Classification of scientific papers based on the OECD (Organization for Economic Co-operation and Development) rubricator. OECD provides a hierarchical 3-level system of classes for labeling scientific articles. This task uses the first two levels of the OECD hierarchy, top 29 classes. In this version, English language support has been added and data partitioning has been slightly modified.",
|
|
164
145
|
reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
|
|
165
146
|
type="Classification",
|
|
166
147
|
category="t2c",
|
|
@@ -12,8 +12,7 @@ _LANGS = {
|
|
|
12
12
|
class ScalaClassification(AbsTaskClassification):
|
|
13
13
|
metadata = TaskMetadata(
|
|
14
14
|
name="ScalaClassification",
|
|
15
|
-
description="
|
|
16
|
-
Published as part of 'ScandEval: A Benchmark for Scandinavian Natural Language Processing'""",
|
|
15
|
+
description="ScaLa a linguistic acceptability dataset for the mainland Scandinavian languages automatically constructed from dependency annotations in Universal Dependencies Treebanks. Published as part of 'ScandEval: A Benchmark for Scandinavian Natural Language Processing'",
|
|
17
16
|
reference="https://aclanthology.org/2023.nodalida-1.20/",
|
|
18
17
|
dataset={
|
|
19
18
|
"path": "mteb/multilingual-scala-classification",
|
|
@@ -205,12 +205,7 @@ _LANGS = {
|
|
|
205
205
|
class SIB200Classification(AbsTaskClassification):
|
|
206
206
|
metadata = TaskMetadata(
|
|
207
207
|
name="SIB200Classification",
|
|
208
|
-
description="
|
|
209
|
-
dataset based on Flores-200 covering 205 languages and dialects annotated. The dataset is
|
|
210
|
-
annotated in English for the topics, science/technology, travel, politics, sports,
|
|
211
|
-
health, entertainment, and geography. The labels are then transferred to the other languages
|
|
212
|
-
in Flores-200 which are human-translated.
|
|
213
|
-
""",
|
|
208
|
+
description="SIB-200 is the largest publicly available topic classification dataset based on Flores-200 covering 205 languages and dialects annotated. The dataset is annotated in English for the topics, science/technology, travel, politics, sports, health, entertainment, and geography. The labels are then transferred to the other languages in Flores-200 which are human-translated.",
|
|
214
209
|
reference="https://arxiv.org/abs/2309.07445",
|
|
215
210
|
dataset={
|
|
216
211
|
"path": "mteb/sib200",
|
|
@@ -45,8 +45,7 @@ class MyanmarNewsV2(AbsTaskClassification):
|
|
|
45
45
|
"path": "mteb/myanmar_news",
|
|
46
46
|
"revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
|
|
47
47
|
},
|
|
48
|
-
description="
|
|
49
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
48
|
+
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
50
49
|
reference="https://huggingface.co/datasets/myanmar_news",
|
|
51
50
|
type="Classification",
|
|
52
51
|
category="t2c",
|
|
@@ -57,8 +57,7 @@ Tan, Liling},
|
|
|
57
57
|
class NepaliNewsClassificationV2(AbsTaskClassification):
|
|
58
58
|
metadata = TaskMetadata(
|
|
59
59
|
name="NepaliNewsClassification.v2",
|
|
60
|
-
description="
|
|
61
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
60
|
+
description="A Nepali dataset for 7500 news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
62
61
|
reference="https://github.com/goru001/nlp-for-nepali",
|
|
63
62
|
dataset={
|
|
64
63
|
"path": "mteb/nepali_news",
|
|
@@ -48,8 +48,7 @@ Suzan, Verberne},
|
|
|
48
48
|
class DutchBookReviewSentimentClassificationV2(AbsTaskClassification):
|
|
49
49
|
metadata = TaskMetadata(
|
|
50
50
|
name="DutchBookReviewSentimentClassification.v2",
|
|
51
|
-
description="
|
|
52
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
51
|
+
description="A Dutch book review for sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900). Additionally, a Dutch prompt was included.",
|
|
53
52
|
reference="https://github.com/benjaminvdb/DBRD",
|
|
54
53
|
dataset={
|
|
55
54
|
"path": "mteb/dutch_book_review_sentiment",
|
|
@@ -86,4 +85,7 @@ Suzan, Verberne},
|
|
|
86
85
|
}
|
|
87
86
|
""",
|
|
88
87
|
adapted_from=["DutchBookReviewSentimentClassification"],
|
|
88
|
+
prompt={
|
|
89
|
+
"query": "Classificeer de gegeven boekrecensie als positieve of negatieve sentiment"
|
|
90
|
+
},
|
|
89
91
|
)
|
|
@@ -27,6 +27,9 @@ class DutchSarcasticHeadlinesClassification(AbsTaskClassification):
|
|
|
27
27
|
dialect=[],
|
|
28
28
|
sample_creation="found",
|
|
29
29
|
bibtex_citation="""""",
|
|
30
|
+
prompt={
|
|
31
|
+
"query": "Classificeer de gegeven krantenkop als sarcastisch of niet sarcastisch"
|
|
32
|
+
},
|
|
30
33
|
)
|
|
31
34
|
|
|
32
35
|
def dataset_transform(self):
|
|
@@ -64,8 +64,7 @@ Tokunaga, Takenobu},
|
|
|
64
64
|
class NoRecClassificationV2(AbsTaskClassification):
|
|
65
65
|
metadata = TaskMetadata(
|
|
66
66
|
name="NoRecClassification.v2",
|
|
67
|
-
description="
|
|
68
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
67
|
+
description="A Norwegian dataset for sentiment classification on review This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
69
68
|
reference="https://aclanthology.org/L18-1661/",
|
|
70
69
|
dataset={
|
|
71
70
|
# using the mini version to keep results ~comparable to the ScandEval benchmark
|
|
@@ -51,8 +51,7 @@ Brygfjeld, Svein Arne},
|
|
|
51
51
|
class NorwegianParliamentClassificationV2(AbsTaskClassification):
|
|
52
52
|
metadata = TaskMetadata(
|
|
53
53
|
name="NorwegianParliamentClassification.v2",
|
|
54
|
-
description="
|
|
55
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
54
|
+
description="Norwegian parliament speeches annotated for sentiment This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
56
55
|
reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
|
|
57
56
|
dataset={
|
|
58
57
|
"path": "mteb/norwegian_parliament",
|
|
@@ -43,8 +43,7 @@ class OdiaNewsClassification(AbsTaskClassification):
|
|
|
43
43
|
class OdiaNewsClassificationV2(AbsTaskClassification):
|
|
44
44
|
metadata = TaskMetadata(
|
|
45
45
|
name="OdiaNewsClassification.v2",
|
|
46
|
-
description="
|
|
47
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
46
|
+
description="A Odia dataset for 3-class classification of Odia news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
48
47
|
reference="https://github.com/goru001/nlp-for-odia",
|
|
49
48
|
dataset={
|
|
50
49
|
"path": "mteb/odia_news",
|
|
@@ -42,8 +42,7 @@ class CbdClassification(AbsTaskClassification):
|
|
|
42
42
|
class CbdClassificationV2(AbsTaskClassification):
|
|
43
43
|
metadata = TaskMetadata(
|
|
44
44
|
name="CBD.v2",
|
|
45
|
-
description="
|
|
46
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
45
|
+
description="Polish Tweets annotated for cyberbullying detection. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
47
46
|
reference="http://2019.poleval.pl/files/poleval2019.pdf",
|
|
48
47
|
dataset={
|
|
49
48
|
"path": "mteb/cbd",
|
|
@@ -274,8 +273,7 @@ Tetreault, Joel},
|
|
|
274
273
|
class AllegroReviewsClassificationV2(AbsTaskClassification):
|
|
275
274
|
metadata = TaskMetadata(
|
|
276
275
|
name="AllegroReviews.v2",
|
|
277
|
-
description="
|
|
278
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
276
|
+
description="A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
279
277
|
reference="https://aclanthology.org/2020.acl-main.111.pdf",
|
|
280
278
|
dataset={
|
|
281
279
|
"path": "mteb/allegro_reviews",
|
|
@@ -362,8 +360,7 @@ class PacClassification(AbsTaskClassification):
|
|
|
362
360
|
class PacClassificationV2(AbsTaskClassification):
|
|
363
361
|
metadata = TaskMetadata(
|
|
364
362
|
name="PAC.v2",
|
|
365
|
-
description="
|
|
366
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
363
|
+
description="Polish Paraphrase Corpus This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
367
364
|
reference="https://arxiv.org/pdf/2211.13112.pdf",
|
|
368
365
|
dataset={
|
|
369
366
|
"path": "mteb/pac",
|
|
@@ -47,8 +47,7 @@ class MorocoV2(AbsTaskClassification):
|
|
|
47
47
|
"path": "mteb/moroco",
|
|
48
48
|
"revision": "6e70588dbd3d583da8b85989c1c3ab3d4bd2e7c4",
|
|
49
49
|
},
|
|
50
|
-
description="
|
|
51
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
50
|
+
description="The Moldavian and Romanian Dialectal Corpus. The MOROCO data set contains Moldavian and Romanian samples of text collected from the news domain. The samples belong to one of the following six topics: (0) culture, (1) finance, (2) politics, (3) science, (4) sports, (5) tech This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
52
51
|
reference="https://huggingface.co/datasets/moroco",
|
|
53
52
|
type="Classification",
|
|
54
53
|
category="t2c",
|
|
@@ -39,8 +39,7 @@ class RomanianReviewsSentiment(AbsTaskClassification):
|
|
|
39
39
|
class RomanianReviewsSentimentV2(AbsTaskClassification):
|
|
40
40
|
metadata = TaskMetadata(
|
|
41
41
|
name="RomanianReviewsSentiment.v2",
|
|
42
|
-
description="
|
|
43
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
42
|
+
description="LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
44
43
|
reference="https://arxiv.org/abs/2101.04197",
|
|
45
44
|
dataset={
|
|
46
45
|
"path": "mteb/romanian_reviews_sentiment",
|
|
@@ -41,8 +41,7 @@ class RomanianSentimentClassification(AbsTaskClassification):
|
|
|
41
41
|
class RomanianSentimentClassificationV2(AbsTaskClassification):
|
|
42
42
|
metadata = TaskMetadata(
|
|
43
43
|
name="RomanianSentimentClassification.v2",
|
|
44
|
-
description="
|
|
45
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
44
|
+
description="An Romanian dataset for sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
46
45
|
reference="https://arxiv.org/abs/2009.08712",
|
|
47
46
|
dataset={
|
|
48
47
|
"path": "mteb/romanian_sentiment",
|
|
@@ -37,8 +37,7 @@ class GeoreviewClassificationV2(AbsTaskClassification):
|
|
|
37
37
|
"path": "mteb/georeview",
|
|
38
38
|
"revision": "5194395f82217bc31212fd6a275002fb405f9dfb",
|
|
39
39
|
},
|
|
40
|
-
description="
|
|
41
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
40
|
+
description="Review classification (5-point scale) based on Yandex Georeview dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
42
41
|
reference="https://github.com/yandex/geo-reviews-dataset-2023",
|
|
43
42
|
type="Classification",
|
|
44
43
|
category="t2c",
|
|
@@ -66,8 +66,7 @@ class HeadlineClassificationV2(AbsTaskClassification):
|
|
|
66
66
|
"path": "mteb/headline",
|
|
67
67
|
"revision": "6bd88e7778ee2e3bd8d0ade1be3ad5b6d969145a",
|
|
68
68
|
},
|
|
69
|
-
description="
|
|
70
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
69
|
+
description="Headline rubric classification based on the paraphraser plus dataset. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
71
70
|
reference="https://aclanthology.org/2020.ngt-1.6/",
|
|
72
71
|
type="Classification",
|
|
73
72
|
category="t2c",
|
|
@@ -70,8 +70,7 @@ class InappropriatenessClassificationV2(AbsTaskClassification):
|
|
|
70
70
|
"path": "mteb/inappropriateness",
|
|
71
71
|
"revision": "2bdbb71d9b972709173f1477d7dd33c3d67f51ac",
|
|
72
72
|
},
|
|
73
|
-
description="
|
|
74
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
73
|
+
description="Inappropriateness identification in the form of binary classification This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
75
74
|
reference="https://aclanthology.org/2021.bsnlp-1.4",
|
|
76
75
|
type="Classification",
|
|
77
76
|
category="t2c",
|
|
@@ -55,8 +55,7 @@ class RuReviewsClassificationV2(AbsTaskClassification):
|
|
|
55
55
|
"path": "mteb/ru_reviews",
|
|
56
56
|
"revision": "46d80ee5ac51be8234725558677e59050b9c418e",
|
|
57
57
|
},
|
|
58
|
-
description="
|
|
59
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
58
|
+
description="Product review classification (3-point scale) based on RuRevies dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
60
59
|
reference="https://github.com/sismetanin/rureviews",
|
|
61
60
|
type="Classification",
|
|
62
61
|
category="t2c",
|
|
@@ -39,8 +39,7 @@ class RuToxicOKMLCUPClassificationV2(AbsTaskClassification):
|
|
|
39
39
|
"path": "mteb/ru_toxic_okmlcup",
|
|
40
40
|
"revision": "729025d2cfa68fcbc587ea80014a42d569cd9048",
|
|
41
41
|
},
|
|
42
|
-
description="
|
|
43
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
42
|
+
description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
44
43
|
reference="https://cups.online/ru/contests/okmlcup2020",
|
|
45
44
|
type="Classification",
|
|
46
45
|
category="t2t",
|
|
@@ -46,8 +46,7 @@ class SentiRuEval2016ClassificationV2(AbsTaskClassification):
|
|
|
46
46
|
"path": "mteb/senti_ru_eval2016",
|
|
47
47
|
"revision": "bfa4cbec1753ffed29a8244a4ec208cc9e6c09a0",
|
|
48
48
|
},
|
|
49
|
-
description="
|
|
50
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
49
|
+
description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, and participants’ results. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
51
50
|
reference="https://github.com/mokoron/sentirueval",
|
|
52
51
|
type="Classification",
|
|
53
52
|
category="t2t",
|
|
@@ -54,8 +54,7 @@ class SinhalaNewsClassification(AbsTaskClassification):
|
|
|
54
54
|
class SinhalaNewsClassificationV2(AbsTaskClassification):
|
|
55
55
|
metadata = TaskMetadata(
|
|
56
56
|
name="SinhalaNewsClassification.v2",
|
|
57
|
-
description="
|
|
58
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
57
|
+
description="This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015). This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
59
58
|
dataset={
|
|
60
59
|
"path": "mteb/sinhala_news",
|
|
61
60
|
"revision": "e0b6e93ed5f086fe358595dff1aaad9eb877667a",
|
|
@@ -45,8 +45,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification):
|
|
|
45
45
|
class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
|
|
46
46
|
metadata = TaskMetadata(
|
|
47
47
|
name="SinhalaNewsSourceClassification.v2",
|
|
48
|
-
description="
|
|
49
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
48
|
+
description="This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala). This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
50
49
|
dataset={
|
|
51
50
|
"path": "mteb/sinhala_news_source",
|
|
52
51
|
"revision": "6902767dbfa6189cbe5f5b5b56ee6300b1702d33",
|
|
@@ -54,8 +54,7 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
54
54
|
class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
55
55
|
metadata = TaskMetadata(
|
|
56
56
|
name="CSFDSKMovieReviewSentimentClassification.v2",
|
|
57
|
-
description="
|
|
58
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
57
|
+
description="The dataset contains 30k user reviews from csfd.cz in Slovak. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
59
58
|
reference="https://arxiv.org/abs/2304.01922",
|
|
60
59
|
dataset={
|
|
61
60
|
"path": "mteb/csfdsk_movie_review_sentiment",
|
|
@@ -32,8 +32,7 @@ class SlovakHateSpeechClassification(AbsTaskClassification):
|
|
|
32
32
|
class SlovakHateSpeechClassificationV2(AbsTaskClassification):
|
|
33
33
|
metadata = TaskMetadata(
|
|
34
34
|
name="SlovakHateSpeechClassification.v2",
|
|
35
|
-
description="
|
|
36
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
35
|
+
description="The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
37
36
|
reference="https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak",
|
|
38
37
|
dataset={
|
|
39
38
|
"path": "mteb/slovak_hate_speech",
|
|
@@ -46,8 +46,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
|
|
|
46
46
|
class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
|
|
47
47
|
metadata = TaskMetadata(
|
|
48
48
|
name="SlovakMovieReviewSentimentClassification.v2",
|
|
49
|
-
description="
|
|
50
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
49
|
+
description="User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative) This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
51
50
|
reference="https://arxiv.org/pdf/2304.01922",
|
|
52
51
|
dataset={
|
|
53
52
|
"path": "mteb/slovak_movie_review_sentiment",
|
|
@@ -42,8 +42,7 @@ class FrenkSlClassification(AbsTaskClassification):
|
|
|
42
42
|
class FrenkSlClassificationV2(AbsTaskClassification):
|
|
43
43
|
metadata = TaskMetadata(
|
|
44
44
|
name="FrenkSlClassification.v2",
|
|
45
|
-
description="
|
|
46
|
-
This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
|
|
45
|
+
description="Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
|
|
47
46
|
dataset={
|
|
48
47
|
"path": "mteb/frenk_sl",
|
|
49
48
|
"revision": "3b69facc14651fbd152fda173683a7ecf9125b82",
|