mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,102 @@
+ import logging
+
+ from datasets import DatasetDict
+
+ from mteb import TaskMetadata
+ from mteb.abstasks import AbsTaskClassification
+ from mteb.abstasks._data_filter.filters import (
+     deduplicate,
+     filter_empty,
+     filter_short,
+     filter_train_leakage,
+     filter_unclear_label,
+     split_train_test,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def clean_dataset(
+     ds: DatasetDict,
+     metadata: TaskMetadata,
+     train_split: str,
+     input_column: str,
+     label_column: str,
+     subset: str | None = None,
+ ) -> DatasetDict:
+     """Apply the full cleaning pipeline with logging."""
+     logger.info("[clean_dataset] Starting dataset cleaning pipeline...")
+
+     transforms = [
+         ("filter_empty", filter_empty),
+         ("deduplicate", deduplicate),
+     ]
+
+     skip_cjk_codes = {"zho", "jpn", "tha", "mya", "cmn"}
+     logger.info("[clean_dataset] Applying short-text filter")
+     cur_langs = (
+         metadata.eval_langs[subset]
+         if isinstance(metadata.eval_langs, dict) and subset
+         else metadata.eval_langs
+     )
+     apply_short = not any(lang.split("-")[0] in skip_cjk_codes for lang in cur_langs)
+     if apply_short:
+         logger.info("[clean_dataset] Applying short-text filter")
+         transforms.append(("filter_short", filter_short))
+
+     for split in [train_split, *metadata.eval_splits]:
+         if split not in ds:
+             logger.warning(f"[clean_dataset] Split '{split}' missing; skipping.")
+             continue
+
+         for name, fn in transforms:
+             before = len(ds[split])
+             ds[split] = fn(ds[split], input_column=input_column)
+             logger.info(
+                 f"[clean_dataset:{split}] {name} removed={before - len(ds[split])}"
+             )
+
+     ds = split_train_test(ds, metadata, train_split, label_column)
+
+     for split in metadata.eval_splits:
+         if split == train_split:
+             continue
+         before = len(ds[split])
+         ds[split] = filter_train_leakage(ds[train_split], ds[split], input_column)
+         logger.info(
+             f"[clean_dataset:{split}] leakage_removed={before - len(ds[split])}"
+         )
+
+     ds = filter_unclear_label(ds, input_column=input_column, label_column=label_column)
+
+     logger.info("[clean_dataset] Cleaning pipeline complete.")
+     return ds
+
+
+ def process_classification(
+     task: AbsTaskClassification,
+ ) -> DatasetDict | dict[str, DatasetDict]:
+     """Process classification task dataset(s) with cleaning pipeline."""
+     if not task.data_loaded:
+         task.load_data()
+     if isinstance(task.dataset, DatasetDict):
+         return clean_dataset(
+             task.dataset,
+             task.metadata,
+             task.train_split,
+             task.input_column_name,
+             task.label_column_name,
+             subset=None,
+         )
+
+     new_ds = {}
+     for subset in task.dataset:
+         new_ds[subset] = clean_dataset(
+             task.dataset[subset],
+             task.metadata,
+             task.train_split,
+             task.input_column_name,
+             task.label_column_name,
+             subset=subset,
+         )
+     return new_ds
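The hunk above is the new classification data-cleaning pipeline (by its +102/-0 size it corresponds to mteb/abstasks/_data_filter/task_pipelines.py in the file list). A minimal usage sketch under that assumption; the module path and the task name are only examples, and mteb.get_task is the standard task lookup:

import mteb

# Assumption: process_classification is importable from the new task_pipelines module.
from mteb.abstasks._data_filter.task_pipelines import process_classification

task = mteb.get_task("Banking77Classification")  # any AbsTaskClassification task
cleaned = process_classification(task)
# Loads the data if needed, then runs filter_empty, deduplicate, the optional
# short-text filter, train/test splitting, train-leakage removal and
# unclear-label filtering. Returns a DatasetDict, or a dict of DatasetDicts
# keyed by subset for multilingual tasks.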
@@ -1,7 +1,8 @@
+ from __future__ import annotations
+
  import hashlib
  from collections import Counter
-
- from PIL import Image
+ from typing import TYPE_CHECKING

  from mteb.types import TopRankedDocumentsType
  from mteb.types.statistics import (
@@ -13,6 +14,9 @@ from mteb.types.statistics import (
  TopRankedStatistics,
  )

+ if TYPE_CHECKING:
+ from PIL import Image
+

  def calculate_text_statistics(texts: list[str]) -> TextStatistics:
  """Calculate descriptive statistics for a list of texts.
@@ -5,7 +5,6 @@ from typing import Any, TypedDict

  import numpy as np
  from datasets import Dataset, DatasetDict
- from PIL import ImageFile
  from sklearn.linear_model import LogisticRegression
  from sklearn.metrics import (
  accuracy_score,
@@ -32,7 +31,6 @@ from ._statistics_calculation import (
  )
  from .abstask import AbsTask

- ImageFile.LOAD_TRUNCATED_IMAGES = True
  logger = logging.getLogger(__name__)


@@ -200,7 +200,7 @@ class AbsTaskClustering(AbsTask):
  downsampled_dataset,
  self.metadata,
  input_column=self.input_column_name,
- batch_size=encode_kwargs["batch_size"],
+ **encode_kwargs,
  ),
  task_metadata=self.metadata,
  hf_subset=hf_subset,
@@ -89,6 +89,9 @@ class AbsTaskClusteringLegacy(AbsTask):
  prediction_folder: Path | None = None,
  **kwargs: Any,
  ) -> ScoresDict:
+ data_split = data_split.select_columns(
+ [self.input_column_name, self.label_column_name]
+ )
  # MTEB text clustering requires renaming and eval per subset.
  if self.metadata.modalities == ["text"]:
  all_metrics = []
@@ -14,6 +14,7 @@ from sklearn.preprocessing import MultiLabelBinarizer
  from typing_extensions import override

  from mteb._create_dataloaders import create_dataloader
+ from mteb._evaluators.classification_metrics import hamming_score
  from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
  from mteb.models import EncoderProtocol

@@ -40,11 +41,13 @@ class MultilabelClassificationMetrics(TypedDict):
  accuracy: Accuracy of the classifier.
  lrap: Label Ranking Average Precision (LRAP) score.
  f1: Macro F1 score.
+ hamming: Hamming score (label-based accuracy).
  """

  accuracy: float
  lrap: float
  f1: float
+ hamming: float


  class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
@@ -112,7 +115,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
  unique_train_dataset,
  self.metadata,
  input_column=self.input_column_name,
- batch_size=encode_kwargs["batch_size"],
+ **encode_kwargs,
  )

  logger.info("Running multilabel classification - Encoding training set...")
@@ -141,7 +144,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
  test_dataset.select_columns(self.input_column_name),
  self.metadata,
  input_column=self.input_column_name,
- batch_size=encode_kwargs["batch_size"],
+ **encode_kwargs,
  )

  logger.info("Running multilabel classification - Encoding test set...")
@@ -157,7 +160,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):

  logger.info("Running multilabel classification - Evaluating classifiers...")
  all_predictions = []
- for i_experiment, sample_indices in enumerate(train_samples):
+ for _, sample_indices in enumerate(train_samples):
  X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
  y_train = train_split.select(sample_indices)[self.label_column_name]
  y_train = binarizer.transform(y_train)
@@ -207,10 +210,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
  else:
  lrap = label_ranking_average_precision_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred, average="macro")
+ hamming = hamming_score(y_test, y_pred)
  return MultilabelClassificationMetrics(
  accuracy=accuracy,
  lrap=lrap,
  f1=f1,
+ hamming=hamming,
  )

  def _undersample_data_indices(
@@ -218,6 +223,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
  ) -> tuple[list[int], list[int]]:
  """Undersample data to have samples_per_label samples of each label.

+ Currently ensures that each label has at least samples_per_label samples.
+

  Returns:
  A tuple containing:
@@ -19,6 +19,7 @@ from mteb.abstasks._statistics_calculation import (
  from mteb.abstasks.abstask import AbsTask
  from mteb.models.model_meta import ScoringFunction
  from mteb.models.models_protocols import EncoderProtocol
+ from mteb.types import PromptType
  from mteb.types.statistics import (
  ImageStatistics,
  LabelStatistics,
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
  Attributes:
  num_samples: number of samples in the dataset.
  number_of_characters: Total number of symbols in the dataset.
- unique_text_pairs: Number of unique pairs
+ unique_pairs: Number of unique pairs

  text1_statistics: Statistics for sentence1
  text2_statistics: Statistics for sentence2
@@ -65,12 +66,16 @@ class AbsTaskPairClassification(AbsTask):
  input2_column_name: The name of the column containing the second sentence in the pair.
  label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
+ input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
+ input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
  """

  abstask_prompt = "Retrieve text that are semantically similar to the given text."
  input1_column_name: str = "sentence1"
  input2_column_name: str = "sentence2"
  label_column_name: str = "labels"
+ input1_prompt_type: PromptType | None = None
+ input2_prompt_type: PromptType | None = None

  def _evaluate_subset(
  self,
@@ -93,6 +98,8 @@ class AbsTaskPairClassification(AbsTask):
  task_metadata=self.metadata,
  hf_split=hf_split,
  hf_subset=hf_subset,
+ input1_prompt_type=self.input1_prompt_type,
+ input2_prompt_type=self.input2_prompt_type,
  **kwargs,
  )
  similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
mteb/abstasks/sts.py CHANGED
@@ -8,6 +8,7 @@ from scipy.stats import pearsonr, spearmanr
  from mteb._evaluators import AnySTSEvaluator
  from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
  from mteb.models import EncoderProtocol
+ from mteb.types import PromptType
  from mteb.types.statistics import (
  ImageStatistics,
  ScoreStatistics,
@@ -89,12 +90,16 @@ class AbsTaskSTS(AbsTask):
  min_score: Minimum possible score in the dataset.
  max_score: Maximum possible score in the dataset.
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
+ input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
+ input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
  """

  abstask_prompt = "Retrieve semantically similar text."
  column_names: tuple[str, str] = ("sentence1", "sentence2")
  min_score: int = 0
  max_score: int = 5
+ input1_prompt_type: PromptType | None = None
+ input2_prompt_type: PromptType | None = None

  def _evaluate_subset(
  self,
@@ -115,6 +120,8 @@ class AbsTaskSTS(AbsTask):
  task_metadata=self.metadata,
  hf_split=hf_split,
  hf_subset=hf_subset,
+ input1_prompt_type=self.input1_prompt_type,
+ input2_prompt_type=self.input2_prompt_type,
  **kwargs,
  )
  scores = evaluator(model, encode_kwargs=encode_kwargs)
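Both AbsTaskPairClassification and AbsTaskSTS gain optional input1_prompt_type / input2_prompt_type attributes that are forwarded to their evaluators, so asymmetric tasks can encode the two inputs with different prompt types. A hedged sketch of how a task subclass might opt in; the specific PromptType member used is an assumption, so check mteb.types.PromptType for the members your version defines:

from mteb.abstasks.sts import AbsTaskSTS
from mteb.types import PromptType

class MyAsymmetricSTS(AbsTaskSTS):
    # Hypothetical subclass: encode the first input with a query-style prompt
    # and leave the second input on the default behaviour (None).
    input1_prompt_type = PromptType.query  # assumed member name; verify locally
    input2_prompt_type = None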
@@ -107,6 +107,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
  SampleCreationMethod = Literal[
  "found",
  "created",
+ "created and machine-translated",
  "human-translated and localized",
  "human-translated",
  "machine-translated",
@@ -1,6 +1,6 @@
- import math
  import re
  from collections import defaultdict
+ from typing import Literal

  import numpy as np
  import pandas as pd
@@ -32,26 +32,18 @@ def _split_on_capital(s: str) -> str:
  return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))


- def _format_n_parameters(n_parameters) -> str:
- if (n_parameters is None) or (not int(n_parameters)):
- return "Unknown"
- n_thousand = int(n_parameters // 1e3)
- if n_thousand < 1:
- return str(int(n_parameters))
- n_zeros = math.log10(n_thousand)
- if n_zeros >= 6:
- return str(n_thousand // (10**6)) + "B"
- if n_zeros >= 3:
- return str(n_thousand // (10**3)) + "M"
- return str(n_thousand) + "K"
+ def _format_n_parameters(n_parameters) -> float | None:
+ """Format n_parameters to be in billions with decimals down to 1 million. I.e. 7M -> 0.007B, 1.5B -> 1.5B, None -> None"""
+ if n_parameters:
+ n_parameters = float(n_parameters)
+ return round(n_parameters / 1e9, 3)
+ return None


- def _format_max_tokens(max_tokens: float | None) -> str:
- if max_tokens is None:
- return "Unknown"
- if max_tokens == np.inf:
- return "Infinite"
- return str(int(max_tokens))
+ def _format_max_tokens(max_tokens: float | None) -> float | None:
+ if max_tokens is None or max_tokens == np.inf:
+ return None
+ return float(max_tokens)


  def _get_means_per_types(per_task: pd.DataFrame):
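With this change the leaderboard table helpers return raw numbers (or None) instead of pre-formatted strings, so the columns can sort numerically: parameter counts are reported in billions rounded to three decimals, and unknown or infinite token limits become None. The expected behaviour follows directly from the new code above:

from mteb.benchmarks._create_table import (  # private helpers shown in this hunk
    _format_max_tokens,
    _format_n_parameters,
)

assert _format_n_parameters(7_000_000) == 0.007      # 7M -> 0.007 B
assert _format_n_parameters(1_500_000_000) == 1.5    # 1.5B -> 1.5
assert _format_n_parameters(None) is None            # previously the string "Unknown"

assert _format_max_tokens(None) is None              # previously "Unknown"
assert _format_max_tokens(float("inf")) is None      # previously "Infinite"
assert _format_max_tokens(8192) == 8192.0            # finite limits pass through as float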
@@ -144,18 +136,18 @@ def _create_summary_table_from_benchmark_results(
  joint_table.insert(
  1,
  "Embedding Dimensions",
- model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+ model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
  )
  joint_table.insert(
  1,
- "Number of Parameters",
+ "Number of Parameters (B)",
  model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
  )
  joint_table.insert(
  1,
  "Memory Usage (MB)",
  model_metas.map(
- lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+ lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
  ),
  )

@@ -250,6 +242,65 @@ def _create_per_task_table_from_benchmark_results(
  return per_task


+ def _create_per_language_table_from_benchmark_results(
+ benchmark_results: BenchmarkResults,
+ language_view: list[str] | Literal["all"],
+ ) -> pd.DataFrame:
+ """Create per-language table from BenchmarkResults.
+
+ Returns a DataFrame with one row per model and one column per language.
+
+ Args:
+ benchmark_results: BenchmarkResults object containing model results
+ language_view: List of languages to include in the per-language table, or "all" for all languages present in the results
+ Returns:
+ DataFrame with per-language scores, ready for styling in the leaderboard
+ """
+ if language_view != "all" and not isinstance(language_view, list):
+ raise ValueError("language_view must be a list of languages or 'all'")
+
+ data = benchmark_results.to_dataframe(aggregation_level="language", format="long")
+
+ if data.empty:
+ no_results_frame = pd.DataFrame(
+ {"No results": ["You can try relaxing your criteria"]}
+ )
+ return no_results_frame
+
+ if language_view != "all":
+ data = data[data["language"].isin(language_view)]
+
+ per_language = data.pivot_table(
+ index="model_name", columns="language", values="score", aggfunc="mean"
+ )
+
+ to_remove = per_language.isna().all(axis="columns")
+ if to_remove.all():
+ no_results_frame = pd.DataFrame(
+ {"No results": ["You can try relaxing your criteria"]}
+ )
+ return no_results_frame
+
+ models_to_remove = list(per_language[to_remove].index)
+ per_language = per_language.drop(models_to_remove, axis=0)
+
+ per_language["borda_rank"] = _get_borda_rank(per_language)
+ per_language = per_language.sort_values("borda_rank", ascending=True)
+ per_language = per_language.drop(columns=["borda_rank"])
+ per_language = per_language.reset_index()
+
+ per_language["model_name"] = per_language["model_name"].map(
+ lambda name: name.split("/")[-1]
+ )
+ per_language = per_language.rename(
+ columns={
+ "model_name": "Model",
+ }
+ )
+
+ return per_language
+
+
  def _create_summary_table_mean_public_private(
  benchmark_results: BenchmarkResults,
  ) -> pd.DataFrame:
@@ -323,18 +374,18 @@ def _create_summary_table_mean_public_private(
  joint_table.insert(
  1,
  "Embedding Dimensions",
- model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+ model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
  )
  joint_table.insert(
  1,
- "Number of Parameters",
+ "Number of Parameters (B)",
  model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
  )
  joint_table.insert(
  1,
  "Memory Usage (MB)",
  model_metas.map(
- lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+ lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
  ),
  )

@@ -358,9 +409,7 @@ def _create_summary_table_mean_public_private(
  "mean(public)": "Mean (Public)",
  "mean(private)": "Mean (Private)",
  }
- # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
- if "Retrieval" in joint_table.columns:
- rename_dict["Retrieval"] = "Mean (Task)"
+
  joint_table = joint_table.rename(columns=rename_dict)

  # Move borda rank to front
@@ -447,18 +496,18 @@ def _create_summary_table_mean_subset(
  joint_table.insert(
  1,
  "Embedding Dimensions",
- model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+ model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
  )
  joint_table.insert(
  1,
- "Number of Parameters",
+ "Number of Parameters (B)",
  model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
  )
  joint_table.insert(
  1,
  "Memory Usage (MB)",
  model_metas.map(
- lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+ lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
  ),
  )

@@ -560,25 +609,23 @@ def _create_summary_table_mean_task_type(

  # Insert model metadata columns
  joint_table.insert(
- 1,
- "Max Tokens",
- model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
+ 1, "Max Tokens", model_metas.map(lambda m: _format_max_tokens(m.max_tokens))
  )
  joint_table.insert(
  1,
  "Embedding Dimensions",
- model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+ model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
  )
  joint_table.insert(
  1,
- "Number of Parameters",
+ "Number of Parameters (B)",
  model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
  )
  joint_table.insert(
  1,
  "Memory Usage (MB)",
  model_metas.map(
- lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+ lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
  ),
  )

@@ -1,21 +1,16 @@
+ from __future__ import annotations
+
  from collections.abc import Iterable, Sequence
- from dataclasses import dataclass
- from typing import TYPE_CHECKING
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Literal

  import pandas as pd

- from mteb.benchmarks._create_table import (
- _create_per_task_table_from_benchmark_results,
- _create_summary_table_from_benchmark_results,
- _create_summary_table_mean_public_private,
- _create_summary_table_mean_subset,
- _create_summary_table_mean_task_type,
- )
- from mteb.results import BenchmarkResults
+ from mteb.abstasks.abstask import AbsTask
  from mteb.types import StrURL

  if TYPE_CHECKING:
- from mteb.abstasks import AbsTask
+ from mteb.results import BenchmarkResults


  @dataclass
@@ -42,7 +37,7 @@ class Benchmark:
  """

  name: str
- tasks: Sequence["AbsTask"]
+ tasks: Sequence[AbsTask]
  description: str | None = None
  reference: StrURL | None = None
  citation: str | None = None
@@ -50,14 +45,15 @@ class Benchmark:
  display_on_leaderboard: bool = True
  icon: str | None = None
  display_name: str | None = None
+ language_view: list[str] | Literal["all"] = field(default_factory=list)

- def __iter__(self) -> Iterable["AbsTask"]:
+ def __iter__(self) -> Iterable[AbsTask]:
  return iter(self.tasks)

  def __len__(self) -> int:
  return len(self.tasks)

- def __getitem__(self, index: int) -> "AbsTask":
+ def __getitem__(self, index: int) -> AbsTask:
  return self.tasks[index]

  def _create_summary_table(
@@ -68,6 +64,10 @@ class Benchmark:
  Returns:
  A pandas DataFrame representing the summary results.
  """
+ from mteb.benchmarks._create_table import (
+ _create_summary_table_from_benchmark_results,
+ )
+
  return _create_summary_table_from_benchmark_results(benchmark_results)

  def _create_per_task_table(
@@ -78,8 +78,38 @@ class Benchmark:
  Returns:
  A pandas DataFrame representing the per-task results.
  """
+ from mteb.benchmarks._create_table import (
+ _create_per_task_table_from_benchmark_results,
+ )
+
  return _create_per_task_table_from_benchmark_results(benchmark_results)

+ def _create_per_language_table(
+ self, benchmark_results: BenchmarkResults
+ ) -> pd.DataFrame:
+ """Create per-language table. Called by the leaderboard app.
+
+ Returns:
+ A pandas DataFrame representing the per-language results.
+ """
+ from mteb.benchmarks._create_table import (
+ _create_per_language_table_from_benchmark_results,
+ )
+
+ if self.language_view == "all" or len(self.language_view) > 0:
+ return _create_per_language_table_from_benchmark_results(
+ benchmark_results, self.language_view
+ )
+ else:
+ no_results_frame = pd.DataFrame(
+ {
+ "No results": [
+ "The per-language table is not available for this benchmark."
+ ]
+ }
+ )
+ return no_results_frame
+

  class RtebBenchmark(Benchmark):
  """Wrapper for RTEB benchmark."""
@@ -87,7 +117,14 @@ class RtebBenchmark(Benchmark):
  def _create_summary_table(
  self, benchmark_results: BenchmarkResults
  ) -> pd.DataFrame:
- return _create_summary_table_mean_public_private(benchmark_results)
+ from mteb.benchmarks._create_table import (
+ _create_summary_table_mean_public_private,
+ )
+
+ joint_table = _create_summary_table_mean_public_private(benchmark_results)
+ # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+ joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
+ return joint_table


  class HUMEBenchmark(Benchmark):
@@ -96,6 +133,8 @@ class HUMEBenchmark(Benchmark):
  def _create_summary_table(
  self, benchmark_results: BenchmarkResults
  ) -> pd.DataFrame:
+ from mteb.benchmarks._create_table import _create_summary_table_mean_subset
+
  return _create_summary_table_mean_subset(benchmark_results)


@@ -105,4 +144,24 @@ class MIEBBenchmark(Benchmark):
  def _create_summary_table(
  self, benchmark_results: BenchmarkResults
  ) -> pd.DataFrame:
+ from mteb.benchmarks._create_table import _create_summary_table_mean_task_type
+
  return _create_summary_table_mean_task_type(benchmark_results)
+
+
+ class VidoreBenchmark(Benchmark):
+ """Wrapper for Vidore3 benchmark."""
+
+ def _create_summary_table(
+ self, benchmark_results: BenchmarkResults
+ ) -> pd.DataFrame:
+ from mteb.benchmarks._create_table import (
+ _create_summary_table_mean_public_private,
+ )
+
+ joint_table = _create_summary_table_mean_public_private(benchmark_results)
+ # For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
+ joint_table = joint_table.rename(
+ columns={"Document Understanding": "Mean (Task)"}
+ )
+ return joint_table
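Benchmark objects can now opt in to the leaderboard's per-language view through the new language_view field; with the default empty list, _create_per_language_table returns the "No results" placeholder instead. A minimal sketch of a benchmark definition using the new field (the benchmark name and task selection are only examples, and the import path is taken from this diff's file layout):

import mteb
from mteb.benchmarks.benchmark import Benchmark

example_benchmark = Benchmark(
    name="Example(custom)",
    tasks=mteb.get_tasks(tasks=["Banking77Classification"]),
    description="Toy benchmark demonstrating the per-language table.",
    language_view="all",  # or an explicit list of language codes, e.g. ["eng", "fra"]
)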