mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,7 @@ _LANGS = {
12
12
  class ScalaClassification(AbsTaskClassification):
13
13
  metadata = TaskMetadata(
14
14
  name="ScalaClassification",
15
- description="""ScaLa a linguistic acceptability dataset for the mainland Scandinavian languages automatically constructed from dependency annotations in Universal Dependencies Treebanks.
16
- Published as part of 'ScandEval: A Benchmark for Scandinavian Natural Language Processing'""",
15
+ description="ScaLa a linguistic acceptability dataset for the mainland Scandinavian languages automatically constructed from dependency annotations in Universal Dependencies Treebanks. Published as part of 'ScandEval: A Benchmark for Scandinavian Natural Language Processing'",
17
16
  reference="https://aclanthology.org/2023.nodalida-1.20/",
18
17
  dataset={
19
18
  "path": "mteb/multilingual-scala-classification",
@@ -58,7 +57,7 @@ Fishel, Mark},
58
57
  def dataset_transform(self):
59
58
  for lang in self.dataset.keys():
60
59
  # convert label to a 0/1 label
61
- labels = self.dataset[lang]["train"]["label"] # type: ignore
60
+ labels = self.dataset[lang]["train"]["label"]
62
61
  lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
63
62
  self.dataset[lang] = self.dataset[lang].map(
64
63
  lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
@@ -205,12 +205,7 @@ _LANGS = {
205
205
  class SIB200Classification(AbsTaskClassification):
206
206
  metadata = TaskMetadata(
207
207
  name="SIB200Classification",
208
- description="""SIB-200 is the largest publicly available topic classification
209
- dataset based on Flores-200 covering 205 languages and dialects annotated. The dataset is
210
- annotated in English for the topics, science/technology, travel, politics, sports,
211
- health, entertainment, and geography. The labels are then transferred to the other languages
212
- in Flores-200 which are human-translated.
213
- """,
208
+ description="SIB-200 is the largest publicly available topic classification dataset based on Flores-200 covering 205 languages and dialects annotated. The dataset is annotated in English for the topics, science/technology, travel, politics, sports, health, entertainment, and geography. The labels are then transferred to the other languages in Flores-200 which are human-translated.",
214
209
  reference="https://arxiv.org/abs/2309.07445",
215
210
  dataset={
216
211
  "path": "mteb/sib200",
@@ -45,8 +45,7 @@ class MyanmarNewsV2(AbsTaskClassification):
45
45
  "path": "mteb/myanmar_news",
46
46
  "revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
47
47
  },
48
- description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
49
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
48
+ description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
50
49
  reference="https://huggingface.co/datasets/myanmar_news",
51
50
  type="Classification",
52
51
  category="t2c",
@@ -57,8 +57,7 @@ Tan, Liling},
57
57
  class NepaliNewsClassificationV2(AbsTaskClassification):
58
58
  metadata = TaskMetadata(
59
59
  name="NepaliNewsClassification.v2",
60
- description="""A Nepali dataset for 7500 news articles
61
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
60
+ description="A Nepali dataset for 7500 news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
62
61
  reference="https://github.com/goru001/nlp-for-nepali",
63
62
  dataset={
64
63
  "path": "mteb/nepali_news",
@@ -48,8 +48,7 @@ Suzan, Verberne},
48
48
  class DutchBookReviewSentimentClassificationV2(AbsTaskClassification):
49
49
  metadata = TaskMetadata(
50
50
  name="DutchBookReviewSentimentClassification.v2",
51
- description="""A Dutch book review for sentiment classification.
52
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
51
+ description="A Dutch book review for sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900). Additionally, a Dutch prompt was included.",
53
52
  reference="https://github.com/benjaminvdb/DBRD",
54
53
  dataset={
55
54
  "path": "mteb/dutch_book_review_sentiment",
@@ -86,4 +85,7 @@ Suzan, Verberne},
86
85
  }
87
86
  """,
88
87
  adapted_from=["DutchBookReviewSentimentClassification"],
88
+ prompt={
89
+ "query": "Classificeer de gegeven boekrecensie als positieve of negatieve sentiment"
90
+ },
89
91
  )
@@ -35,4 +35,7 @@ class DutchColaClassification(AbsTaskClassification):
35
35
  year = {2024},
36
36
  }
37
37
  """,
38
+ prompt={
39
+ "query": "Classificeer de gegeven zin als grammaticaal aanvaardbaar of niet aanvaardbaar"
40
+ },
38
41
  )
@@ -34,4 +34,7 @@ class DutchGovernmentBiasClassification(AbsTaskClassification):
34
34
  year = {2025},
35
35
  }
36
36
  """,
37
+ prompt={
38
+ "query": "Classificeer het gegeven overheidsdocument als bevooroordeeld of niet bevooroordeeld"
39
+ },
37
40
  )
@@ -27,4 +27,7 @@ class DutchNewsArticlesClassification(AbsTaskClassification):
27
27
  dialect=[],
28
28
  sample_creation="found",
29
29
  bibtex_citation="",
30
+ prompt={
31
+ "query": "Classificeer het gegeven nieuwsartikel in het juiste onderwerp of thema"
32
+ },
30
33
  )
@@ -27,6 +27,9 @@ class DutchSarcasticHeadlinesClassification(AbsTaskClassification):
27
27
  dialect=[],
28
28
  sample_creation="found",
29
29
  bibtex_citation="""""",
30
+ prompt={
31
+ "query": "Classificeer de gegeven krantenkop als sarcastisch of niet sarcastisch"
32
+ },
30
33
  )
31
34
 
32
35
  def dataset_transform(self):
@@ -38,4 +38,7 @@ class IconclassClassification(AbsTaskClassification):
38
38
  year = {2023},
39
39
  }
40
40
  """,
41
+ prompt={
42
+ "query": "Classificeer de gegeven titel van het kunstwerk in het juiste onderwerp of thema"
43
+ },
41
44
  )
@@ -35,4 +35,7 @@ class OpenTenderClassification(AbsTaskClassification):
35
35
  year = {2025},
36
36
  }
37
37
  """,
38
+ prompt={
39
+ "query": "Classificeer de gegeven aanbestedingsbeschrijving in het juiste onderwerp of thema"
40
+ },
38
41
  )
@@ -37,6 +37,9 @@ class VaccinChatNLClassification(AbsTaskClassification):
37
37
  year = {2022},
38
38
  }
39
39
  """,
40
+ prompt={
41
+ "query": "Gegeven een gebruikersuiting als query, bepaal de gebruikersintenties"
42
+ },
40
43
  )
41
44
 
42
45
  def dataset_transform(self):
@@ -64,8 +64,7 @@ Tokunaga, Takenobu},
64
64
  class NoRecClassificationV2(AbsTaskClassification):
65
65
  metadata = TaskMetadata(
66
66
  name="NoRecClassification.v2",
67
- description="""A Norwegian dataset for sentiment classification on review
68
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
67
+ description="A Norwegian dataset for sentiment classification on review This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
69
68
  reference="https://aclanthology.org/L18-1661/",
70
69
  dataset={
71
70
  # using the mini version to keep results ~comparable to the ScandEval benchmark
@@ -51,8 +51,7 @@ Brygfjeld, Svein Arne},
51
51
  class NorwegianParliamentClassificationV2(AbsTaskClassification):
52
52
  metadata = TaskMetadata(
53
53
  name="NorwegianParliamentClassification.v2",
54
- description="""Norwegian parliament speeches annotated for sentiment
55
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
54
+ description="Norwegian parliament speeches annotated for sentiment This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
56
55
  reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
57
56
  dataset={
58
57
  "path": "mteb/norwegian_parliament",
@@ -43,8 +43,7 @@ class OdiaNewsClassification(AbsTaskClassification):
43
43
  class OdiaNewsClassificationV2(AbsTaskClassification):
44
44
  metadata = TaskMetadata(
45
45
  name="OdiaNewsClassification.v2",
46
- description="""A Odia dataset for 3-class classification of Odia news articles
47
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
46
+ description="A Odia dataset for 3-class classification of Odia news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
48
47
  reference="https://github.com/goru001/nlp-for-odia",
49
48
  dataset={
50
49
  "path": "mteb/odia_news",
@@ -42,8 +42,7 @@ class CbdClassification(AbsTaskClassification):
42
42
  class CbdClassificationV2(AbsTaskClassification):
43
43
  metadata = TaskMetadata(
44
44
  name="CBD.v2",
45
- description="""Polish Tweets annotated for cyberbullying detection.
46
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
45
+ description="Polish Tweets annotated for cyberbullying detection. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
47
46
  reference="http://2019.poleval.pl/files/poleval2019.pdf",
48
47
  dataset={
49
48
  "path": "mteb/cbd",
@@ -274,8 +273,7 @@ Tetreault, Joel},
274
273
  class AllegroReviewsClassificationV2(AbsTaskClassification):
275
274
  metadata = TaskMetadata(
276
275
  name="AllegroReviews.v2",
277
- description="""A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.
278
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
276
+ description="A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
279
277
  reference="https://aclanthology.org/2020.acl-main.111.pdf",
280
278
  dataset={
281
279
  "path": "mteb/allegro_reviews",
@@ -362,8 +360,7 @@ class PacClassification(AbsTaskClassification):
362
360
  class PacClassificationV2(AbsTaskClassification):
363
361
  metadata = TaskMetadata(
364
362
  name="PAC.v2",
365
- description="""Polish Paraphrase Corpus
366
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
363
+ description="Polish Paraphrase Corpus This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
367
364
  reference="https://arxiv.org/pdf/2211.13112.pdf",
368
365
  dataset={
369
366
  "path": "mteb/pac",
@@ -47,8 +47,7 @@ class MorocoV2(AbsTaskClassification):
47
47
  "path": "mteb/moroco",
48
48
  "revision": "6e70588dbd3d583da8b85989c1c3ab3d4bd2e7c4",
49
49
  },
50
- description="""The Moldavian and Romanian Dialectal Corpus. The MOROCO data set contains Moldavian and Romanian samples of text collected from the news domain. The samples belong to one of the following six topics: (0) culture, (1) finance, (2) politics, (3) science, (4) sports, (5) tech
51
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
50
+ description="The Moldavian and Romanian Dialectal Corpus. The MOROCO data set contains Moldavian and Romanian samples of text collected from the news domain. The samples belong to one of the following six topics: (0) culture, (1) finance, (2) politics, (3) science, (4) sports, (5) tech This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
52
51
  reference="https://huggingface.co/datasets/moroco",
53
52
  type="Classification",
54
53
  category="t2c",
@@ -39,8 +39,7 @@ class RomanianReviewsSentiment(AbsTaskClassification):
39
39
  class RomanianReviewsSentimentV2(AbsTaskClassification):
40
40
  metadata = TaskMetadata(
41
41
  name="RomanianReviewsSentiment.v2",
42
- description="""LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian
43
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
42
+ description="LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
44
43
  reference="https://arxiv.org/abs/2101.04197",
45
44
  dataset={
46
45
  "path": "mteb/romanian_reviews_sentiment",
@@ -41,8 +41,7 @@ class RomanianSentimentClassification(AbsTaskClassification):
41
41
  class RomanianSentimentClassificationV2(AbsTaskClassification):
42
42
  metadata = TaskMetadata(
43
43
  name="RomanianSentimentClassification.v2",
44
- description="""An Romanian dataset for sentiment classification.
45
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
44
+ description="An Romanian dataset for sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
46
45
  reference="https://arxiv.org/abs/2009.08712",
47
46
  dataset={
48
47
  "path": "mteb/romanian_sentiment",
@@ -37,8 +37,7 @@ class GeoreviewClassificationV2(AbsTaskClassification):
37
37
  "path": "mteb/georeview",
38
38
  "revision": "5194395f82217bc31212fd6a275002fb405f9dfb",
39
39
  },
40
- description="""Review classification (5-point scale) based on Yandex Georeview dataset
41
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
40
+ description="Review classification (5-point scale) based on Yandex Georeview dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
42
41
  reference="https://github.com/yandex/geo-reviews-dataset-2023",
43
42
  type="Classification",
44
43
  category="t2c",
@@ -66,8 +66,7 @@ class HeadlineClassificationV2(AbsTaskClassification):
66
66
  "path": "mteb/headline",
67
67
  "revision": "6bd88e7778ee2e3bd8d0ade1be3ad5b6d969145a",
68
68
  },
69
- description="""Headline rubric classification based on the paraphraser plus dataset.
70
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
69
+ description="Headline rubric classification based on the paraphraser plus dataset. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
71
70
  reference="https://aclanthology.org/2020.ngt-1.6/",
72
71
  type="Classification",
73
72
  category="t2c",
@@ -70,8 +70,7 @@ class InappropriatenessClassificationV2(AbsTaskClassification):
70
70
  "path": "mteb/inappropriateness",
71
71
  "revision": "2bdbb71d9b972709173f1477d7dd33c3d67f51ac",
72
72
  },
73
- description="""Inappropriateness identification in the form of binary classification
74
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
73
+ description="Inappropriateness identification in the form of binary classification This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
75
74
  reference="https://aclanthology.org/2021.bsnlp-1.4",
76
75
  type="Classification",
77
76
  category="t2c",
@@ -55,8 +55,7 @@ class RuReviewsClassificationV2(AbsTaskClassification):
55
55
  "path": "mteb/ru_reviews",
56
56
  "revision": "46d80ee5ac51be8234725558677e59050b9c418e",
57
57
  },
58
- description="""Product review classification (3-point scale) based on RuRevies dataset
59
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
58
+ description="Product review classification (3-point scale) based on RuRevies dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
60
59
  reference="https://github.com/sismetanin/rureviews",
61
60
  type="Classification",
62
61
  category="t2c",
@@ -39,8 +39,7 @@ class RuToxicOKMLCUPClassificationV2(AbsTaskClassification):
39
39
  "path": "mteb/ru_toxic_okmlcup",
40
40
  "revision": "729025d2cfa68fcbc587ea80014a42d569cd9048",
41
41
  },
42
- description="""On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.
43
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
42
+ description="On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
44
43
  reference="https://cups.online/ru/contests/okmlcup2020",
45
44
  type="Classification",
46
45
  category="t2t",
@@ -46,8 +46,7 @@ class SentiRuEval2016ClassificationV2(AbsTaskClassification):
46
46
  "path": "mteb/senti_ru_eval2016",
47
47
  "revision": "bfa4cbec1753ffed29a8244a4ec208cc9e6c09a0",
48
48
  },
49
- description="""Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, and participants’ results.
50
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
49
+ description="Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, and participants’ results. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
51
50
  reference="https://github.com/mokoron/sentirueval",
52
51
  type="Classification",
53
52
  category="t2t",
@@ -54,8 +54,7 @@ class SinhalaNewsClassification(AbsTaskClassification):
54
54
  class SinhalaNewsClassificationV2(AbsTaskClassification):
55
55
  metadata = TaskMetadata(
56
56
  name="SinhalaNewsClassification.v2",
57
- description="""This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015).
58
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
57
+ description="This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015). This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
59
58
  dataset={
60
59
  "path": "mteb/sinhala_news",
61
60
  "revision": "e0b6e93ed5f086fe358595dff1aaad9eb877667a",
@@ -45,8 +45,7 @@ class SinhalaNewsSourceClassification(AbsTaskClassification):
45
45
  class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
46
46
  metadata = TaskMetadata(
47
47
  name="SinhalaNewsSourceClassification.v2",
48
- description="""This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala).
49
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
48
+ description="This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala). This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
50
49
  dataset={
51
50
  "path": "mteb/sinhala_news_source",
52
51
  "revision": "6902767dbfa6189cbe5f5b5b56ee6300b1702d33",
@@ -54,8 +54,7 @@ class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
54
54
  class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
55
55
  metadata = TaskMetadata(
56
56
  name="CSFDSKMovieReviewSentimentClassification.v2",
57
- description="""The dataset contains 30k user reviews from csfd.cz in Slovak.
58
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
57
+ description="The dataset contains 30k user reviews from csfd.cz in Slovak. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
59
58
  reference="https://arxiv.org/abs/2304.01922",
60
59
  dataset={
61
60
  "path": "mteb/csfdsk_movie_review_sentiment",
@@ -32,8 +32,7 @@ class SlovakHateSpeechClassification(AbsTaskClassification):
32
32
  class SlovakHateSpeechClassificationV2(AbsTaskClassification):
33
33
  metadata = TaskMetadata(
34
34
  name="SlovakHateSpeechClassification.v2",
35
- description="""The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak.
36
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
35
+ description="The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
37
36
  reference="https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak",
38
37
  dataset={
39
38
  "path": "mteb/slovak_hate_speech",
@@ -46,8 +46,7 @@ class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
46
46
  class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
47
47
  metadata = TaskMetadata(
48
48
  name="SlovakMovieReviewSentimentClassification.v2",
49
- description="""User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative)
50
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
49
+ description="User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative) This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
51
50
  reference="https://arxiv.org/pdf/2304.01922",
52
51
  dataset={
53
52
  "path": "mteb/slovak_movie_review_sentiment",
@@ -42,8 +42,7 @@ class FrenkSlClassification(AbsTaskClassification):
42
42
  class FrenkSlClassificationV2(AbsTaskClassification):
43
43
  metadata = TaskMetadata(
44
44
  name="FrenkSlClassification.v2",
45
- description="""Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset.
46
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
45
+ description="Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
47
46
  dataset={
48
47
  "path": "mteb/frenk_sl",
49
48
  "revision": "3b69facc14651fbd152fda173683a7ecf9125b82",
@@ -39,8 +39,7 @@ class SpanishNewsClassification(AbsTaskClassification):
39
39
  class SpanishNewsClassificationV2(AbsTaskClassification):
40
40
  metadata = TaskMetadata(
41
41
  name="SpanishNewsClassification.v2",
42
- description="""A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories.
43
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
42
+ description="A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
44
43
  reference="https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news",
45
44
  dataset={
46
45
  "path": "mteb/spanish_news",
@@ -56,8 +56,7 @@ Vylomova, Ekaterina},
56
56
  class SpanishSentimentClassificationV2(AbsTaskClassification):
57
57
  metadata = TaskMetadata(
58
58
  name="SpanishSentimentClassification.v2",
59
- description="""A Spanish dataset for sentiment classification.
60
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
59
+ description="A Spanish dataset for sentiment classification. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
61
60
  reference="https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment",
62
61
  dataset={
63
62
  "path": "mteb/spanish_sentiment",
@@ -45,8 +45,7 @@ class SiswatiNewsClassification(AbsTaskClassification):
45
45
  class SiswatiNewsClassificationV2(AbsTaskClassification):
46
46
  metadata = TaskMetadata(
47
47
  name="SiswatiNewsClassification.v2",
48
- description="""Siswati News Classification Dataset
49
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
48
+ description="Siswati News Classification Dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
50
49
  reference="https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news",
51
50
  dataset={
52
51
  "path": "mteb/siswati_news",
@@ -49,8 +49,7 @@ class SwahiliNewsClassification(AbsTaskClassification):
49
49
  class SwahiliNewsClassificationV2(AbsTaskClassification):
50
50
  metadata = TaskMetadata(
51
51
  name="SwahiliNewsClassification.v2",
52
- description="""Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets
53
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
52
+ description="Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
54
53
  reference="https://huggingface.co/datasets/Mollel/SwahiliNewsClassification",
55
54
  dataset={
56
55
  "path": "mteb/swahili_news",
@@ -50,8 +50,7 @@ class DalajClassificationV2(AbsTaskClassification):
50
50
  "revision": "ecf6f2d83e8e85816ec3974896557a4aafce4f3e",
51
51
  "name": "dalaj",
52
52
  },
53
- description="""A Swedish dataset for linguistic acceptability. Available as a part of Superlim.
54
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
53
+ description="A Swedish dataset for linguistic acceptability. Available as a part of Superlim. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
55
54
  reference="https://spraakbanken.gu.se/en/resources/superlim",
56
55
  type="Classification",
57
56
  category="t2c",
@@ -47,8 +47,7 @@ Fishel, Mark},
47
47
  class SweRecClassificationV2(AbsTaskClassification):
48
48
  metadata = TaskMetadata(
49
49
  name="SweRecClassification.v2",
50
- description="""A Swedish dataset for sentiment classification on review
51
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
50
+ description="A Swedish dataset for sentiment classification on review This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
52
51
  reference="https://aclanthology.org/2023.nodalida-1.20/",
53
52
  dataset={
54
53
  "path": "mteb/swe_rec",
@@ -32,8 +32,7 @@ class SwedishSentimentClassification(AbsTaskClassification):
32
32
  class SwedishSentimentClassificationV2(AbsTaskClassification):
33
33
  metadata = TaskMetadata(
34
34
  name="SwedishSentimentClassification.v2",
35
- description="""Dataset of Swedish reviews scarped from various public available websites
36
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
35
+ description="Dataset of Swedish reviews scarped from various public available websites This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
37
36
  reference="https://huggingface.co/datasets/swedish_reviews",
38
37
  dataset={
39
38
  "path": "mteb/swedish_sentiment",
@@ -45,8 +45,7 @@ class TamilNewsClassification(AbsTaskClassification):
45
45
  class TamilNewsClassificationV2(AbsTaskClassification):
46
46
  metadata = TaskMetadata(
47
47
  name="TamilNewsClassification.v2",
48
- description="""A Tamil dataset for 6-class classification of Tamil news articles
49
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
48
+ description="A Tamil dataset for 6-class classification of Tamil news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
50
49
  reference="https://github.com/vanangamudi/tamil-news-classification",
51
50
  dataset={
52
51
  "path": "mteb/tamil_news",
@@ -36,8 +36,7 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
36
36
  class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
37
37
  metadata = TaskMetadata(
38
38
  name="TeluguAndhraJyotiNewsClassification.v2",
39
- description="""A Telugu dataset for 5-class classification of Telugu news articles
40
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
39
+ description="A Telugu dataset for 5-class classification of Telugu news articles This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
41
40
  reference="https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset",
42
41
  dataset={
43
42
  "path": "mteb/telugu_andhra_jyoti_news",
@@ -46,8 +46,7 @@ Polpanumas, Charin},
46
46
  class WisesightSentimentClassificationV2(AbsTaskClassification):
47
47
  metadata = TaskMetadata(
48
48
  name="WisesightSentimentClassification.v2",
49
- description="""Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)
50
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
49
+ description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question) This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
51
50
  reference="https://github.com/PyThaiNLP/wisesight-sentiment",
52
51
  dataset={
53
52
  "path": "mteb/wisesight_sentiment",
@@ -43,8 +43,7 @@ class TswanaNewsClassification(AbsTaskClassification):
43
43
  class TswanaNewsClassificationV2(AbsTaskClassification):
44
44
  metadata = TaskMetadata(
45
45
  name="TswanaNewsClassification.v2",
46
- description="""Tswana News Classification Dataset
47
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
46
+ description="Tswana News Classification Dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
48
47
  reference="https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17",
49
48
  dataset={
50
49
  "path": "mteb/tswana_news",
@@ -1,3 +1,6 @@
1
+ from .turkish_constitutional_court import (
2
+ TurkishConstitutionalCourtViolation,
3
+ )
1
4
  from .turkish_movie_sentiment_classification import (
2
5
  TurkishMovieSentimentClassification,
3
6
  TurkishMovieSentimentClassificationV2,
@@ -8,6 +11,7 @@ from .turkish_product_sentiment_classification import (
8
11
  )
9
12
 
10
13
  __all__ = [
14
+ "TurkishConstitutionalCourtViolation",
11
15
  "TurkishMovieSentimentClassification",
12
16
  "TurkishMovieSentimentClassificationV2",
13
17
  "TurkishProductSentimentClassification",
@@ -0,0 +1,41 @@
1
+ from mteb.abstasks.classification import AbsTaskClassification
2
+ from mteb.abstasks.task_metadata import TaskMetadata
3
+
4
+
5
+ class TurkishConstitutionalCourtViolation(AbsTaskClassification):
6
+ # Label/input column names for this task. NOTE(review): the earlier comment said load_data renames columns, but this file defines no load_data override - confirm where (or whether) any renaming happens.
7
+ label_column_name = "label"
8
+ input_column_name = "text"
9
+
10
+ metadata = TaskMetadata(
11
+ name="TurkishConstitutionalCourtViolation",
12
+ description="Binary classification of Turkish constitutional court decisions: Violation vs No violation.",
13
+ reference="https://huggingface.co/datasets/KocLab-Bilkent/turkish-constitutional-court",
14
+ type="Classification",
15
+ category="t2c",
16
+ modalities=["text"],
17
+ eval_splits=["test"],
18
+ eval_langs=["tur-Latn"],
19
+ main_score="f1",
20
+ dataset={
21
+ "path": "denizgulal/turkish-constitutional-court-violation-clean",
22
+ "revision": "333f49b7ddc72fa4a86ec5bd756a28c585311c74",
23
+ },
24
+ date=("2000-01-01", "2023-02-20"), # dataset card last updated Feb 20, 2023
25
+ domains=["Legal", "Non-fiction"],
26
+ task_subtypes=["Political classification"],
27
+ license="cc-by-4.0",
28
+ annotations_creators="human-annotated",
29
+ dialect=[],
30
+ sample_creation="found",
31
+ bibtex_citation=r"""
32
+ @article{mumcuoglu2021natural,
33
+ author = {Mumcuoglu, Emre and Ozturk, Ceyhun E. and Ozaktas, Haldun M. and Koc, Aykut},
34
+ journal = {Information Processing and Management},
35
+ number = {5},
36
+ title = {Natural language processing in law: Prediction of outcomes in the higher courts of Turkey},
37
+ volume = {58},
38
+ year = {2021},
39
+ }
40
+ """,
41
+ )
@@ -45,8 +45,7 @@ class TurkishMovieSentimentClassification(AbsTaskClassification):
45
45
  class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
46
46
  metadata = TaskMetadata(
47
47
  name="TurkishMovieSentimentClassification.v2",
48
- description="""Turkish Movie Review Dataset
49
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
48
+ description="Turkish Movie Review Dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
50
49
  reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf",
51
50
  dataset={
52
51
  "path": "mteb/turkish_movie_sentiment",
@@ -40,8 +40,7 @@ class TurkishProductSentimentClassification(AbsTaskClassification):
40
40
  class TurkishProductSentimentClassificationV2(AbsTaskClassification):
41
41
  metadata = TaskMetadata(
42
42
  name="TurkishProductSentimentClassification.v2",
43
- description="""Turkish Product Review Dataset
44
- This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
43
+ description="Turkish Product Review Dataset This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)",
45
44
  reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf",
46
45
  dataset={
47
46
  "path": "mteb/turkish_product_sentiment",