mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
--- a/mteb/results/benchmark_results.py
+++ b/mteb/results/benchmark_results.py
@@ -1,3 +1,4 @@
+import functools
 import json
 import logging
 import warnings
@@ -15,6 +16,7 @@ from mteb.abstasks.task_metadata import (
     TaskDomain,
     TaskType,
 )
+from mteb.benchmarks.benchmark import Benchmark
 from mteb.models import ModelMeta
 from mteb.models.get_model_meta import get_model_metas
 from mteb.types import (
@@ -31,6 +33,24 @@ from .model_result import ModelResult, _aggregate_and_pivot
 logger = logging.getLogger(__name__)


+# Global cache for model metas and version parsing
+@functools.lru_cache
+def _get_cached_model_metas() -> dict[str, str | None]:
+    """Cache model metas to avoid repeated calls."""
+    return {meta.name: meta.revision for meta in get_model_metas()}
+
+
+@functools.lru_cache(maxsize=10000)
+def _parse_version_cached(version_str: str | None) -> Version | None:
+    """Cache version parsing to avoid repeated parsing."""
+    if version_str is None:
+        return None
+    try:
+        return Version(version_str)
+    except (InvalidVersion, TypeError):
+        return None
+
+
 class BenchmarkResults(BaseModel):
     """Data class to hold the benchmark results of a model.

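The two cached helpers above are the core of the speedup: model metas are fetched once per process, and each distinct version string is parsed at most once. A minimal standalone sketch of the same `functools.lru_cache` pattern, assuming `Version`/`InvalidVersion` come from `packaging.version` as the names suggest (illustration only, not mteb code):

```python
import functools

from packaging.version import InvalidVersion, Version


@functools.lru_cache(maxsize=10000)
def parse_version_cached(version_str: str | None) -> Version | None:
    """Parse a version string once; repeated calls hit the cache."""
    if version_str is None:
        return None
    try:
        return Version(version_str)
    except (InvalidVersion, TypeError):
        return None


# Same argument -> same cached object, no re-parsing.
assert parse_version_cached("2.5.2") is parse_version_cached("2.5.2")
```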
@@ -39,10 +59,10 @@ class BenchmarkResults(BaseModel):
     """

     model_results: list[ModelResult]
-    model_config = (
-        ConfigDict(  # to free up the name model_results which is otherwise protected
-            protected_namespaces=(),
-        )
+    benchmark: Benchmark | None = None
+    model_config = ConfigDict(
+        protected_namespaces=(),  # to free up the name model_results which is otherwise protected
+        arbitrary_types_allowed=True,  # Benchmark is dataclasses.dataclass
     )

     def __repr__(self) -> str:
@@ -173,40 +193,6 @@ class BenchmarkResults(BaseModel):
         Returns:
             A new BenchmarkResults object with the revisions joined.
         """
-
-        def parse_version(version_str: str) -> Version | None:
-            try:
-                return Version(version_str)
-            except (InvalidVersion, TypeError):
-                return None
-
-        def keep_best(group: pd.DataFrame) -> pd.DataFrame:
-            # Filtering out task_results where no scores are present
-            group = group[group["has_scores"]]
-            is_main_revision = group["revision"] == group["main_revision"]
-            # If the main revision is present we select that
-            if is_main_revision.sum() > 0:
-                return group[is_main_revision].head(n=1)
-            unique_revisions = group["revision"].unique()
-
-            # ensure None/NA/"external" revisions is filtered out
-            group.loc[group["revision"].isna(), "revision"] = "no_revision_available"
-            group.loc[group["revision"] == "external", "revision"] = (
-                "no_revision_available"
-            )
-
-            # Filtering out no_revision_available if other revisions are present
-            if (len(unique_revisions) > 1) and (
-                "no_revision_available" in unique_revisions
-            ):
-                group = group[group["revision"] != "no_revision_available"]
-            # If there are any not-NA mteb versions, we select the latest one
-            if group["mteb_version"].notna().any():
-                group = group.dropna(subset=["mteb_version"])
-                group = group.sort_values("mteb_version", ascending=False)
-                return group.head(n=1)
-            return group.head(n=1)
-
         records = []
         for model_result in self:
             for task_result in model_result.task_results:
@@ -223,17 +209,54 @@ class BenchmarkResults(BaseModel):
         if not records:
             return BenchmarkResults.model_construct(model_results=[])
         task_df = pd.DataFrame.from_records(records)
-        model_to_main_revision = {
-            meta.name: meta.revision for meta in get_model_metas()
-        }
+
+        # Use cached model metas
+        model_to_main_revision = _get_cached_model_metas()
         task_df["main_revision"] = task_df["model"].map(model_to_main_revision)  # type: ignore
-        task_df["mteb_version"] = task_df["mteb_version"].map(parse_version)  # type: ignore
-        task_df = (
-            task_df.groupby(["model", "task_name"])
-            .apply(keep_best)
-            .reset_index(drop=True)
+
+        # Use cached version parsing
+        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)  # type: ignore
+
+        # Filter out rows without scores first
+        task_df = task_df[task_df["has_scores"]]
+
+        # Optimize groupby with vectorized operations
+        # Sort by priority: main_revision match, then mteb_version (descending), then revision
+        task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]
+
+        # Handle None/NA/external revisions
+        task_df["revision_clean"] = task_df["revision"].copy()
+        task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
+            "no_revision_available"
         )
+        task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
+            "no_revision_available"
+        )
+
+        # Create a priority column for sorting
+        # Higher priority = better to keep
+        # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
+        task_df["priority"] = 0
+        task_df.loc[task_df["is_main_revision"], "priority"] += 1000
+        task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
+        task_df.loc[
+            task_df["revision_clean"] != "no_revision_available", "priority"
+        ] += 10
+
+        # Sort by priority (desc), mteb_version (desc), and take first per group
+        task_df = task_df.sort_values(
+            ["model", "task_name", "priority", "mteb_version"],
+            ascending=[True, True, False, False],
+            na_position="last",
+        )
+
+        task_df = task_df.groupby(["model", "task_name"], as_index=False).first()
+
+        # Reconstruct model results
         model_results = []
+        # Group by original revision to maintain deterministic behavior
+        # After the first() selection above, each (model, task_name) is unique,
+        # so grouping by original revision ensures consistent ModelResult creation
         for (model, model_revision), group in task_df.groupby(["model", "revision"]):
             model_result = ModelResult.model_construct(
                 model_name=model,
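The rewrite replaces the per-group `apply(keep_best)` (a Python-level loop over groups) with one vectorized pass: score every row with an additive priority, sort the whole frame once, then keep the first row per (model, task_name) group. A toy illustration of the sort-then-first pattern on made-up data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "model": ["m1", "m1", "m2"],
        "task_name": ["t", "t", "t"],
        "is_main_revision": [False, True, False],
        "score": [0.10, 0.20, 0.30],
    }
)

# Additive priority: a row matching the model's main revision outranks the rest.
df["priority"] = 0
df.loc[df["is_main_revision"], "priority"] += 1000

# One global sort; with no missing values, first() then returns the top row per group.
df = df.sort_values(["model", "task_name", "priority"], ascending=[True, True, False])
best = df.groupby(["model", "task_name"], as_index=False).first()
print(best[["model", "score"]])  # m1 keeps the main-revision row (0.20)
```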
@@ -296,7 +319,7 @@ class BenchmarkResults(BaseModel):

     def to_dataframe(
         self,
-        aggregation_level: Literal["subset", "split", "task"] = "task",
+        aggregation_level: Literal["subset", "split", "task", "language"] = "task",
         aggregation_fn: Callable[[list[Score]], Any] | None = None,
         include_model_revision: bool = False,
         format: Literal["wide", "long"] = "wide",
@@ -321,6 +344,7 @@ class BenchmarkResults(BaseModel):
                 - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
                 - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
                 - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
+                - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.
             aggregation_fn: The function to use for aggregation. If None, the mean will be used.
             include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
                 If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.
@@ -361,6 +385,23 @@ class BenchmarkResults(BaseModel):
             format=format,
         )

+    def get_benchmark_result(self) -> pd.DataFrame:
+        """Get aggregated scores for each model in the benchmark.
+
+        Uses the benchmark's summary table creation method to compute scores.
+
+        Returns:
+            A DataFrame with the aggregated benchmark scores for each model.
+        """
+        if self.benchmark is None:
+            raise ValueError(
+                "No benchmark associated with these results (self.benchmark is None). "
+                "To get benchmark results, load results with a Benchmark object. "
+                "`results = cache.load_results(tasks='MTEB(eng, v2)')`"
+            )
+
+        return self.benchmark._create_summary_table(self)
+
     def __iter__(self) -> Iterator[ModelResult]:
         return iter(self.model_results)

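Together with the new `benchmark` field, `get_benchmark_result` lets a loaded result set reproduce the benchmark's summary table directly. A hedged usage sketch: the import path follows the package layout in the file list, and the `cache.load_results(...)` call is quoted from the error message above, so the exact loading entry point may differ between releases.

```python
from mteb.results.benchmark_results import BenchmarkResults


def summarize(results: BenchmarkResults) -> None:
    # Works only if a Benchmark object was attached when the results were
    # loaded, e.g. results = cache.load_results(tasks="MTEB(eng, v2)");
    # otherwise get_benchmark_result() raises the ValueError defined above.
    summary = results.get_benchmark_result()
    print(summary.head())

    # The new aggregation level: one row per model and language.
    per_language = results.to_dataframe(aggregation_level="language")
    print(per_language.head())
```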
--- a/mteb/results/model_result.py
+++ b/mteb/results/model_result.py
@@ -22,7 +22,7 @@ from mteb.types import (
     SplitName,
 )

-from .task_result import TaskResult
+from .task_result import TaskError, TaskResult

 logger = logging.getLogger(__name__)

@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 def _aggregate_and_pivot(
     df: pd.DataFrame,
     columns: list[str],
-    aggregation_level: Literal["subset", "split", "task"],
+    aggregation_level: Literal["subset", "split", "task", "language"],
     format: Literal["wide", "long"],
     aggregation_fn: Callable[[list[Score]], Any] | None,
 ) -> pd.DataFrame:
@@ -43,6 +43,12 @@ def _aggregate_and_pivot(
     elif aggregation_level == "task":
         index_columns = ["task_name"]

+    elif aggregation_level == "language":
+        index_columns = ["language"]
+        df = df.explode("language").reset_index(
+            drop=True
+        )  # each language in its own row before aggregation
+
     # perform aggregation
     if aggregation_fn is None:
         aggregation_fn = np.mean
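The "language" level leans on `pandas.DataFrame.explode`: the list-valued `language` column becomes one row per language, so the existing aggregation machinery can group by it. A small demonstration:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "task_name": ["T1", "T2"],
        "language": [["eng-Latn", "rus-Cyrl"], ["eng-Latn"]],
        "score": [0.8, 0.6],
    }
)

# One row per language; scores are repeated, so a later
# groupby("language")["score"].mean() averages across tasks per language.
exploded = df.explode("language").reset_index(drop=True)
print(exploded)
#   task_name  language  score
# 0        T1  eng-Latn    0.8
# 1        T1  rus-Cyrl    0.8
# 2        T2  eng-Latn    0.6
```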
@@ -82,6 +88,7 @@ class ModelResult(BaseModel):
             protected_namespaces=(),
         )
     )
+    exceptions: list[TaskError] | None = None

     def __repr__(self) -> str:
         n_entries = len(self.task_results)
@@ -226,7 +233,7 @@ class ModelResult(BaseModel):
             )
         return entries

-    def _get_score_for_table(self) -> list[dict[str, str | float]]:
+    def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
         scores_data = []
         model_name = self.model_name
         for task_result in self.task_results:
@@ -238,10 +245,10 @@ class ModelResult(BaseModel):
                     "model_revision": self.model_revision,
                     "task_name": task_name,
                     "split": split,
+                    "language": score_item.get("languages", ["Unknown"]),
                     "subset": score_item.get("hf_subset", "default"),
                     "score": score_item.get("main_score", None),
                 }
-
                 scores_data.append(row)

         return scores_data
--- a/mteb/results/task_result.py
+++ b/mteb/results/task_result.py
@@ -633,21 +633,23 @@ class TaskResult(BaseModel):
         task = get_task(self.task_name)

         splits = task.eval_splits
-        hf_subsets = task.hf_subsets
-        hf_subsets = set(hf_subsets)
+        hf_subsets = set(task.hf_subsets)  # Convert to set once

         new_scores = {}
         seen_splits = set()
         for split in self.scores:
             if split not in splits:
                 continue
-            new_scores[split] = []
             seen_subsets = set()
-            for _scores in self.scores[split]:
-                if _scores["hf_subset"] not in hf_subsets:
-                    continue
-                new_scores[split].append(_scores)
+            # Use list comprehension for better performance
+            new_scores[split] = [
+                _scores
+                for _scores in self.scores[split]
+                if _scores["hf_subset"] in hf_subsets
+            ]
+            for _scores in new_scores[split]:
                 seen_subsets.add(_scores["hf_subset"])
+
             if seen_subsets != hf_subsets:
                 missing_subsets = hf_subsets - seen_subsets
                 if len(missing_subsets) > 2:
@@ -664,9 +666,9 @@ class TaskResult(BaseModel):
             logger.warning(
                 f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
             )
-        new_res = {**self.to_dict(), "scores": new_scores}
-        new_res = TaskResult.from_validated(**new_res)
-        return new_res
+        data = self.model_dump()
+        data["scores"] = new_scores
+        return type(self).model_construct(**data)

     def is_mergeable(
         self,
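The `to_dict`/`from_validated` round trip is replaced by pydantic v2's `model_dump` plus `model_construct`, which skips validation entirely; that is safe here because the scores being filtered were already validated once. The pattern in a standalone pydantic sketch (hypothetical model, not mteb's):

```python
from pydantic import BaseModel


class Result(BaseModel):
    task_name: str
    scores: dict[str, list[dict]]


r = Result(task_name="demo", scores={"test": [{"main_score": 0.5}]})

# Dump, mutate, and rebuild without re-running validators (fast, but it
# trusts the data, so only use it on values that were validated before).
data = r.model_dump()
data["scores"] = {"test": []}
filtered = type(r).model_construct(**data)
assert filtered.scores == {"test": []}
```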
@@ -698,27 +700,31 @@ class TaskResult(BaseModel):
             name = result.metadata.name
             revision = result.metadata.revision
         else:
+            msg = "result must be a TaskResult or AbsTask object"
+            if raise_error:
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if self.task_name != name:
+            msg = f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
+            msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} and {mteb_version})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
+            msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
             if raise_error:
-                raise ValueError(
-                    f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
-                )
+                raise ValueError(msg)
+            logger.debug(msg)
             return False

         return True
@@ -836,3 +842,15 @@ class TaskResult(BaseModel):
                 )
             )
         return results
+
+
+class TaskError(BaseModel):
+    """A class to represent an error that occurred during the evaluation of a task.
+
+    Attributes:
+        task_name: The name of the MTEB task.
+        exception: The error message that occurred during the evaluation.
+    """
+
+    task_name: str
+    exception: str
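Paired with the new `ModelResult.exceptions: list[TaskError] | None` field, this lets an evaluation run carry its failures alongside its scores instead of aborting. A minimal sketch of recording one; the failing-task loop is hypothetical, while the import path matches the `from .task_result import TaskError` shown in model_result.py:

```python
from mteb.results.task_result import TaskError

errors: list[TaskError] = []
try:
    raise RuntimeError("CUDA out of memory")  # stand-in for a failing task
except Exception as err:
    # `exception` is a plain string, so format the exception yourself.
    errors.append(TaskError(task_name="Banking77Classification", exception=str(err)))

print(errors[0].task_name, "->", errors[0].exception)
```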
--- a/mteb/similarity_functions.py
+++ b/mteb/similarity_functions.py
@@ -1,6 +1,7 @@
 import torch

 from mteb.models import EncoderProtocol
+from mteb.models.model_meta import ScoringFunction
 from mteb.types import Array


@@ -38,6 +39,54 @@ def compute_pairwise_similarity(
     return pairwise_cos_sim(embedding1, embedding2)


+def select_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
+def select_pairwise_similarity(
+    embedding1: Array,
+    embedding2: Array,
+    similarity_fn: ScoringFunction,
+) -> Array:
+    """Compute pairwise similarity between two sets of embeddings using the specified similarity function.
+
+    Args:
+        embedding1: The first set of embeddings.
+        embedding2: The second set of embeddings.
+        similarity_fn: The similarity function to use (COSINE, DOT_PRODUCT, EUCLIDEAN).
+
+    Returns:
+        Array: The computed pairwise similarity scores.
+    """
+    if similarity_fn is ScoringFunction.COSINE:
+        return pairwise_cos_sim(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.DOT_PRODUCT:
+        return pairwise_dot_score(embedding1, embedding2)
+    elif similarity_fn is ScoringFunction.EUCLIDEAN:
+        return pairwise_euclidean_sim(embedding1, embedding2)
+    raise ValueError(f"Unsupported similarity function: {similarity_fn}")
+
+
 def _normalize_embeddings(embeddings: Array) -> torch.Tensor:
     """Normalizes the embeddings matrix, so that each sentence embedding has unit length.

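The two dispatchers map a `ScoringFunction` member onto the matching kernel: `select_similarity` returns the full n×m score matrix, while `select_pairwise_similarity` scores row i of one batch against row i of the other. A usage sketch assuming mteb ≥ 2.5.2, with the import paths shown in this diff; the shape comments follow the usual sentence-transformers-style semantics of these kernels:

```python
import torch

from mteb.models.model_meta import ScoringFunction
from mteb.similarity_functions import select_pairwise_similarity, select_similarity

queries = torch.randn(4, 8)
docs = torch.randn(6, 8)

all_pairs = select_similarity(queries, docs, ScoringFunction.COSINE)
print(all_pairs.shape)  # (4, 6): every query against every document

aligned = select_pairwise_similarity(queries, docs[:4], ScoringFunction.DOT_PRODUCT)
print(aligned.shape)  # (4,): query i against document i
```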
--- a/mteb/tasks/bitext_mining/multilingual/__init__.py
+++ b/mteb/tasks/bitext_mining/multilingual/__init__.py
@@ -16,7 +16,7 @@ from .nusa_translation_bitext_mining import NusaTranslationBitextMining
 from .nusa_x_bitext_mining import NusaXBitextMining
 from .phinc_bitext_mining import PhincBitextMining
 from .roma_tales_bitext_mining import RomaTalesBitextMining
-from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining
+from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining, RuSciBenchBitextMiningV2
 from .tatoeba_bitext_mining import TatoebaBitextMining
 from .web_faq_bitext_mining import WebFAQBitextMiningQAs, WebFAQBitextMiningQuestions

@@ -40,6 +40,7 @@ __all__ = [
     "PhincBitextMining",
     "RomaTalesBitextMining",
     "RuSciBenchBitextMining",
+    "RuSciBenchBitextMiningV2",
     "TatoebaBitextMining",
     "WebFAQBitextMiningQAs",
     "WebFAQBitextMiningQuestions",
--- a/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py
+++ b/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py
@@ -23,7 +23,7 @@ class BUCCBitextMining(AbsTaskBitextMining):
             "path": "mteb/BUCC",
             "revision": "414572247440f0ccacf7eb0bb70a31533a0e5443",
         },
-        description="BUCC bitext mining dataset",
+        description="BUCC bitext mining dataset train split.",
         reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
         type="BitextMining",
         category="t2t",
@@ -71,7 +71,9 @@ Rapp, Reinhard},

         sentence1 = data["sentence1"][0]
         sentence2 = data["sentence2"][0]
-        sentence1 = [sentence1[i] for (i, j) in gold]
+        sentence1 = [
+            sentence1[i] for (i, j) in gold
+        ]  # keep only sentences in gold. The 2nd value is meant for sentence2 but not used here. This is fixed in BUCC.v2.
         logger.info(f"Lang {lang} num gold {len(gold)}")
         logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
         logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")
--- a/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py
+++ b/mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py
@@ -20,7 +20,7 @@ class BUCCBitextMiningFast(AbsTaskBitextMining):
             "path": "mteb/bucc-bitext-mining",
             "revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
         },
-        description="BUCC bitext mining dataset",
+        description="BUCC bitext mining dataset train split, gold set only.",
         reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
         type="BitextMining",
         category="t2t",
--- a/mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py
+++ b/mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py
@@ -10,11 +10,53 @@ class RuSciBenchBitextMining(AbsTaskBitextMining):
             "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining",
             "revision": "e5840033c5cf2573932db027ac8001fe0a7eb6fa",
         },
-        description="""This task focuses on finding translations of scientific articles.
-    The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications.
-    Russian authors often provide English translations for their abstracts and titles,
-    and the data consists of these paired titles and abstracts. The task evaluates a model's ability
-    to match an article's Russian title and abstract to its English counterpart, or vice versa.""",
+        description="This task focuses on finding translations of scientific articles. The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. Russian authors often provide English translations for their abstracts and titles, and the data consists of these paired titles and abstracts. The task evaluates a model's ability to match an article's Russian title and abstract to its English counterpart, or vice versa.",
+        reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
+        type="BitextMining",
+        category="t2c",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ru-en": ["rus-Cyrl", "eng-Latn"],
+            "en-ru": ["eng-Latn", "rus-Cyrl"],
+        },
+        main_score="f1",
+        date=("2007-01-01", "2023-01-01"),
+        domains=["Academic", "Non-fiction", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        dialect=[],
+        sample_creation="found",
+        annotations_creators="derived",
+        bibtex_citation=r"""
+@article{vatolin2024ruscibench,
+  author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.},
+  doi = {10.1134/S1064562424602191},
+  issn = {1531-8362},
+  journal = {Doklady Mathematics},
+  month = {12},
+  number = {1},
+  pages = {S251--S260},
+  title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations},
+  url = {https://doi.org/10.1134/S1064562424602191},
+  volume = {110},
+  year = {2024},
+}
+""",
+        prompt="Given the following title and abstract of the scientific article, find its translation",
+        superseded_by="RuSciBenchBitextMining.v2",
+    )
+
+
+class RuSciBenchBitextMiningV2(AbsTaskBitextMining):
+    fast_loading = True
+    metadata = TaskMetadata(
+        name="RuSciBenchBitextMining.v2",
+        dataset={
+            "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining",
+            "revision": "20e815e8ac8787331546386dfd177821510f79a3",
+        },
+        description="This task focuses on finding translations of scientific articles. The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. Russian authors often provide English translations for their abstracts and titles, and the data consists of these paired titles and abstracts. The task evaluates a model's ability to match an article's Russian title and abstract to its English counterpart, or vice versa. Compared to the previous version, 6 erroneous examples have been removed.",
         reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
         type="BitextMining",
         category="t2c",
--- a/mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py
+++ b/mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py
@@ -198,9 +198,7 @@ _SPLITS = ["default"]
 class WebFAQBitextMiningQuestions(AbsTaskBitextMining):
     metadata = TaskMetadata(
         name="WebFAQBitextMiningQuestions",
-        description="""The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages.
-A sentence in the "WebFAQBitextMiningQuestions" task is the question originating from an aligned QA.
-The dataset is sourced from FAQ pages on the web.""",
+        description='The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. A sentence in the "WebFAQBitextMiningQuestions" task is the question originating from an aligned QA. The dataset is sourced from FAQ pages on the web.',
         reference="https://huggingface.co/PaDaS-Lab",
         dataset={
             "path": "PaDaS-Lab/webfaq-bitexts",
@@ -254,9 +252,7 @@ The dataset is sourced from FAQ pages on the web.""",
 class WebFAQBitextMiningQAs(AbsTaskBitextMining):
     metadata = TaskMetadata(
         name="WebFAQBitextMiningQAs",
-        description="""The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages.
-A sentence in the "WebFAQBitextMiningQAs" task is a concatenation of a question and its corresponding answer.
-The dataset is sourced from FAQ pages on the web.""",
+        description='The WebFAQ Bitext Dataset consists of natural FAQ-style Question-Answer pairs that align across languages. A sentence in the "WebFAQBitextMiningQAs" task is a concatenation of a question and its corresponding answer. The dataset is sourced from FAQ pages on the web.',
         reference="https://huggingface.co/PaDaS-Lab",
         dataset={
             "path": "PaDaS-Lab/webfaq-bitexts",
--- a/mteb/tasks/classification/ara/ajgt.py
+++ b/mteb/tasks/classification/ara/ajgt.py
@@ -45,8 +45,7 @@ class AJGTV2(AbsTaskClassification):
             "path": "mteb/ajgt",
             "revision": "0a3dea7301ee0c051891f04d32f3e8577a9eae36",
         },
-        description="""Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets (900 for training and 900 for testing) annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets (900 for training and 900 for testing) annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-60042-0_66/",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ara/hotel_review_sentiment_classification.py
+++ b/mteb/tasks/classification/ara/hotel_review_sentiment_classification.py
@@ -45,8 +45,7 @@ class HotelReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/HotelReviewSentimentClassification",
             "revision": "f5e6a24acbed4182114ffdf46747090b3f51e836",
         },
-        description="""HARD is a dataset of Arabic hotel reviews collected from the Booking.com website.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ara/online_store_review_sentiment_classification.py
+++ b/mteb/tasks/classification/ara/online_store_review_sentiment_classification.py
@@ -41,8 +41,7 @@ class OnlineStoreReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/online_store_review_sentiment",
             "revision": "de0e8eed65adf1cbc58f8743a5f5c5df556de4c4",
         },
-        description="""This dataset contains Arabic reviews of products from the SHEIN online store.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="This dataset contains Arabic reviews of products from the SHEIN online store. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://huggingface.co/datasets/Ruqiya/Arabic_Reviews_of_SHEIN",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py
+++ b/mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py
@@ -52,8 +52,7 @@ class RestaurantReviewSentimentClassificationV2(AbsTaskClassification):
             "path": "mteb/restaurant_review_sentiment",
             "revision": "5d28c1e8fb393173a849696ed178b90a6f78754a",
         },
-        description="""Dataset of 8156 restaurant reviews from qaym.com in Arabic for sentiment analysis
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Dataset of 8156 restaurant reviews from qaym.com in Arabic for sentiment analysis This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ara/tweet_emotion_classification.py
+++ b/mteb/tasks/classification/ara/tweet_emotion_classification.py
@@ -45,8 +45,7 @@ class TweetEmotionClassificationV2(AbsTaskClassification):
             "path": "mteb/TweetEmotionClassification",
             "revision": "930d65840c089406ceed5241b1a9ba7294e5eeae",
         },
-        description="""A dataset of 10,012 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="A dataset of 10,012 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ara/tweet_sarcasm_classification.py
+++ b/mteb/tasks/classification/ara/tweet_sarcasm_classification.py
@@ -62,8 +62,7 @@ class TweetSarcasmClassificationV2(AbsTaskClassification):
             "path": "mteb/tweet_sarcasm",
             "revision": "3a20898e2ea3303844e907d55f7a815a7644150d",
         },
-        description="""Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://aclanthology.org/2020.osact-1.5/",
         type="Classification",
         category="t2c",
--- a/mteb/tasks/classification/ben/bengali_document_classification.py
+++ b/mteb/tasks/classification/ben/bengali_document_classification.py
@@ -55,8 +55,7 @@ Islam, Tanvir},
 class BengaliDocumentClassificationV2(AbsTaskClassification):
     metadata = TaskMetadata(
         name="BengaliDocumentClassification.v2",
-        description="""Dataset for News Classification, categorized with 13 domains.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="Dataset for News Classification, categorized with 13 domains. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://aclanthology.org/2023.eacl-main.4",
         dataset={
             "path": "mteb/bengali_document",
--- a/mteb/tasks/classification/ben/bengali_hate_speech_classification.py
+++ b/mteb/tasks/classification/ben/bengali_hate_speech_classification.py
@@ -45,8 +45,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
 class BengaliHateSpeechClassificationV2(AbsTaskClassification):
     metadata = TaskMetadata(
         name="BengaliHateSpeechClassification.v2",
-        description="""The Bengali Hate Speech Dataset is a Bengali-language dataset of news articles collected from various Bengali media sources and categorized based on the type of hate in the text.
-    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)""",
+        description="The Bengali Hate Speech Dataset is a Bengali-language dataset of news articles collected from various Bengali media sources and categorized based on the type of hate in the text. This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2632)",
         reference="https://huggingface.co/datasets/bn_hate_speech",
         dataset={
             "path": "mteb/bengali_hate_speech",