mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,8 @@ from mteb.benchmarks.benchmarks.benchmarks import (
12
12
  FA_MTEB_2,
13
13
  HUME,
14
14
  JINA_VDR,
15
+ JMTEB_LITE_V1,
16
+ JMTEB_V2,
15
17
  LONG_EMBED,
16
18
  MIEB_ENG,
17
19
  MIEB_IMG,
@@ -38,10 +40,12 @@ from mteb.benchmarks.benchmarks.benchmarks import (
38
40
  SEB,
39
41
  VIDORE,
40
42
  VIDORE_V2,
43
+ VIDORE_V3,
41
44
  VISUAL_DOCUMENT_RETRIEVAL,
42
45
  VN_MTEB,
43
46
  CoIR,
44
47
  MTEB_code,
48
+ MTEB_MAIN_RU_v1_1,
45
49
  MTEB_multilingual_v1,
46
50
  MTEB_multilingual_v2,
47
51
  RAR_b,
@@ -73,6 +77,8 @@ __all__ = [
73
77
  "HUME",
74
78
  "HUME",
75
79
  "JINA_VDR",
80
+ "JMTEB_LITE_V1",
81
+ "JMTEB_V2",
76
82
  "LONG_EMBED",
77
83
  "MIEB_ENG",
78
84
  "MIEB_IMG",
@@ -108,9 +114,11 @@ __all__ = [
108
114
  "SEB",
109
115
  "VIDORE",
110
116
  "VIDORE_V2",
117
+ "VIDORE_V3",
111
118
  "VISUAL_DOCUMENT_RETRIEVAL",
112
119
  "VN_MTEB",
113
120
  "CoIR",
121
+ "MTEB_MAIN_RU_v1_1",
114
122
  "MTEB_code",
115
123
  "MTEB_multilingual_v1",
116
124
  "MTEB_multilingual_v2",
@@ -1,4 +1,9 @@
1
- from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark, MIEBBenchmark
1
+ from mteb.benchmarks.benchmark import (
2
+ Benchmark,
3
+ HUMEBenchmark,
4
+ MIEBBenchmark,
5
+ VidoreBenchmark,
6
+ )
2
7
  from mteb.get_tasks import MTEBTasks, get_task, get_tasks
3
8
 
4
9
  MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,
@@ -180,7 +185,7 @@ We recommend that you use [MTEB(eng, v2)](http://mteb-leaderboard.hf.space/?benc
180
185
 
181
186
  MTEB_MAIN_RU = Benchmark(
182
187
  name="MTEB(rus, v1)",
183
- display_name="Russian",
188
+ display_name="Russian legacy",
184
189
  icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
185
190
  tasks=MTEBTasks(
186
191
  get_tasks(
@@ -235,6 +240,67 @@ MTEB_MAIN_RU = Benchmark(
235
240
  year = {2024},
236
241
  }
237
242
  """,
243
+ contacts=["Samoed", "artemsnegirev", "Drozhzhinastya"],
244
+ )
245
+
246
+ MTEB_MAIN_RU_v1_1 = Benchmark(
247
+ name="MTEB(rus, v1.1)",
248
+ display_name="Russian",
249
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
250
+ tasks=MTEBTasks(
251
+ get_tasks(
252
+ languages=["rus"],
253
+ tasks=[
254
+ # Classification
255
+ "GeoreviewClassification",
256
+ "HeadlineClassification",
257
+ "InappropriatenessClassification",
258
+ "KinopoiskClassification",
259
+ "MassiveIntentClassification",
260
+ "MassiveScenarioClassification",
261
+ "RuReviewsClassification",
262
+ "RuSciBenchGRNTIClassification",
263
+ "RuSciBenchOECDClassification",
264
+ # Clustering
265
+ "GeoreviewClusteringP2P",
266
+ "RuSciBenchGRNTIClusteringP2P",
267
+ "RuSciBenchOECDClusteringP2P",
268
+ # MultiLabelClassification
269
+ "CEDRClassification",
270
+ "SensitiveTopicsClassification",
271
+ # PairClassification
272
+ "TERRa",
273
+ # Reranking
274
+ "MIRACLReranking",
275
+ "RuBQReranking",
276
+ # Retrieval
277
+ "MIRACLRetrievalHardNegatives.v2",
278
+ "RiaNewsRetrievalHardNegatives.v2",
279
+ "RuBQRetrieval",
280
+ # STS
281
+ "RUParaPhraserSTS",
282
+ "STS22",
283
+ ],
284
+ )
285
+ + get_tasks(
286
+ tasks=["RuSTSBenchmarkSTS"],
287
+ eval_splits=["test"],
288
+ )
289
+ ),
290
+ description="A Russian version of the Massive Text Embedding Benchmark covering the task categories of classification, clustering, reranking, pair classification, retrieval, and semantic similarity. In v1.1, MIRACLRetrieval and RiaNewsRetrieval were replaced with their HardNegatives variants for improved time-optimization measurement. MIRACLRetrievalHardNegatives and RiaNewsRetrievalHardNegatives are used in their updated versions (v2), both of which include improved default prompts.",
291
+ reference="https://aclanthology.org/2023.eacl-main.148/",
292
+ citation=r"""
293
+ @misc{snegirev2024russianfocusedembeddersexplorationrumteb,
294
+ archiveprefix = {arXiv},
295
+ author = {Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
296
+ eprint = {2408.12503},
297
+ primaryclass = {cs.CL},
298
+ title = {The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
299
+ url = {https://arxiv.org/abs/2408.12503},
300
+ year = {2024},
301
+ }
302
+ """,
303
+ contacts=["Samoed", "artemsnegirev", "Drozhzhinastya"],
238
304
  )
239
305
 
240
306
 
@@ -243,7 +309,7 @@ RU_SCI_BENCH = Benchmark(
243
309
  tasks=get_tasks(
244
310
  tasks=[
245
311
  # BitextMining
246
- "RuSciBenchBitextMining",
312
+ "RuSciBenchBitextMining.v2",
247
313
  # Classification
248
314
  "RuSciBenchCoreRiscClassification",
249
315
  "RuSciBenchGRNTIClassification.v2",
@@ -369,7 +435,7 @@ MTEB_RETRIEVAL_MEDICAL = Benchmark(
369
435
  ],
370
436
  ),
371
437
  description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
372
- reference="",
438
+ reference=None,
373
439
  citation=None,
374
440
  )
375
441
 
@@ -405,6 +471,7 @@ SEB = Benchmark(
405
471
  name="MTEB(Scandinavian, v1)",
406
472
  display_name="Scandinavian",
407
473
  icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg",
474
+ language_view=["dan-Latn", "swe-Latn", "nno-Latn", "nob-Latn"],
408
475
  tasks=get_tasks(
409
476
  tasks=[
410
477
  # Bitext
@@ -887,6 +954,28 @@ MTEB_multilingual_v1 = Benchmark(
887
954
  MTEB_multilingual_v2 = Benchmark(
888
955
  name="MTEB(Multilingual, v2)",
889
956
  display_name="Multilingual",
957
+ language_view=[
958
+ "eng-Latn", # English
959
+ "zho-Hans", # Chinese (Simplified)
960
+ "hin-Deva", # Hindi
961
+ "spa-Latn", # Spanish
962
+ "fra-Latn", # French
963
+ "ara-Arab", # Arabic
964
+ "ben-Beng", # Bengali
965
+ "rus-Cyrl", # Russian
966
+ "por-Latn", # Portuguese
967
+ "urd-Arab", # Urdu
968
+ "ind-Latn", # Indonesian
969
+ "deu-Latn", # German
970
+ "jpn-Jpan", # Japanese
971
+ "swa-Latn", # Swahili
972
+ "mar-Deva", # Marathi
973
+ "tel-Telu", # Telugu
974
+ "tur-Latn", # Turkish
975
+ "tam-Taml", # Tamil
976
+ "vie-Latn", # Vietnamese
977
+ "kor-Hang", # Korean
978
+ ],
890
979
  icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg",
891
980
  tasks=mteb_multilingual_tasks,
892
981
  description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages. ",
@@ -897,7 +986,7 @@ MTEB_multilingual_v2 = Benchmark(
897
986
 
898
987
  MTEB_JPN = Benchmark(
899
988
  name="MTEB(jpn, v1)",
900
- display_name="Japanese",
989
+ display_name="Japanese Legacy",
901
990
  icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
902
991
  tasks=get_tasks(
903
992
  languages=["jpn"],
@@ -1642,7 +1731,7 @@ MTEB_NL = Benchmark(
1642
1731
  exclusive_language_filter=True,
1643
1732
  tasks=[
1644
1733
  # Classification
1645
- "DutchBookReviewSentimentClassification",
1734
+ "DutchBookReviewSentimentClassification.v2",
1646
1735
  "MassiveIntentClassification",
1647
1736
  "MassiveScenarioClassification",
1648
1737
  "SIB200Classification",
@@ -1673,10 +1762,10 @@ MTEB_NL = Benchmark(
1673
1762
  # # Reranking
1674
1763
  "WikipediaRerankingMultilingual",
1675
1764
  # # Retrieval
1676
- "ArguAna-NL",
1677
- "SCIDOCS-NL",
1678
- "SciFact-NL",
1679
- "NFCorpus-NL",
1765
+ "ArguAna-NL.v2",
1766
+ "SCIDOCS-NL.v2",
1767
+ "SciFact-NL.v2",
1768
+ "NFCorpus-NL.v2",
1680
1769
  "BelebeleRetrieval",
1681
1770
  "WebFAQRetrieval",
1682
1771
  "DutchNewsArticlesRetrieval",
@@ -2214,10 +2303,51 @@ VIDORE_V2 = Benchmark(
2214
2303
  """,
2215
2304
  )
2216
2305
 
2217
- VISUAL_DOCUMENT_RETRIEVAL = Benchmark(
2218
- name="VisualDocumentRetrieval",
2219
- display_name="Visual Document Retrieval",
2220
- icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
2306
+ VIDORE_V3 = VidoreBenchmark(
2307
+ name="ViDoRe(v3)",
2308
+ display_name="ViDoRe V3",
2309
+ language_view=[
2310
+ "deu-Latn",
2311
+ "eng-Latn",
2312
+ "fra-Latn",
2313
+ "ita-Latn",
2314
+ "por-Latn",
2315
+ "spa-Latn",
2316
+ ],
2317
+ icon="https://cdn-uploads.huggingface.co/production/uploads/66e16a677c2eb2da5109fb5c/x99xqw__fl2UaPbiIdC_f.png",
2318
+ tasks=get_tasks(
2319
+ tasks=[
2320
+ "Vidore3FinanceEnRetrieval",
2321
+ "Vidore3IndustrialRetrieval",
2322
+ "Vidore3ComputerScienceRetrieval",
2323
+ "Vidore3PharmaceuticalsRetrieval",
2324
+ "Vidore3HrRetrieval",
2325
+ "Vidore3FinanceFrRetrieval",
2326
+ "Vidore3PhysicsRetrieval",
2327
+ "Vidore3EnergyRetrieval",
2328
+ "Vidore3TelecomRetrieval",
2329
+ "Vidore3NuclearRetrieval",
2330
+ ]
2331
+ ),
2332
+ description="ViDoRe V3 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents. The benchmark includes both open and closed datasets: to submit results on private tasks, please [open an issue](https://github.com/embeddings-benchmark/mteb/issues?template=eval_request.yaml).",
2333
+ reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
2334
+ citation=r"""
2335
+ @misc{mace2025vidorev3,
2336
+ author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
2337
+ day = {5},
2338
+ howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
2339
+ journal = {Hugging Face Blog},
2340
+ month = {November},
2341
+ publisher = {Hugging Face},
2342
+ title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
2343
+ year = {2025},
2344
+ }
2345
+ """,
2346
+ )
2347
+
2348
+ VISUAL_DOCUMENT_RETRIEVAL = VidoreBenchmark(
2349
+ name="ViDoRe(v1&v2)",
2350
+ display_name="ViDoRe (V1&V2)",
2221
2351
  tasks=get_tasks(
2222
2352
  tasks=[
2223
2353
  # v1
@@ -2459,7 +2589,121 @@ HUME = HUMEBenchmark(
2459
2589
  ],
2460
2590
  ),
2461
2591
  description="The HUME benchmark is designed to evaluate the performance of text embedding models and humans on a comparable set of tasks. This captures areas where models perform better than human annotators and the reverse. In the paper, we go further into the analysis and what conclusions can be drawn.",
2462
- reference="Coming soon (in review)",
2592
+ reference=None,
2463
2593
  citation=None,
2464
2594
  contacts=["AdnanElAssadi56", "KennethEnevoldsen", "isaac-chung", "Samoed"],
2465
2595
  )
2596
+
2597
+ JMTEB_V2 = Benchmark(
2598
+ name="JMTEB(v2)",
2599
+ display_name="Japanese",
2600
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
2601
+ tasks=get_tasks(
2602
+ languages=["jpn"],
2603
+ tasks=[
2604
+ # Clustering (3)
2605
+ "LivedoorNewsClustering.v2",
2606
+ "MewsC16JaClustering",
2607
+ "SIB200ClusteringS2S",
2608
+ # Classification (7)
2609
+ "AmazonReviewsClassification",
2610
+ "AmazonCounterfactualClassification",
2611
+ "MassiveIntentClassification",
2612
+ "MassiveScenarioClassification",
2613
+ "JapaneseSentimentClassification",
2614
+ "SIB200Classification",
2615
+ "WRIMEClassification",
2616
+ # STS (2)
2617
+ "JSTS",
2618
+ "JSICK",
2619
+ # Retrieval (11)
2620
+ "JaqketRetrieval",
2621
+ "MrTidyRetrieval",
2622
+ "JaGovFaqsRetrieval",
2623
+ "NLPJournalTitleAbsRetrieval.V2",
2624
+ "NLPJournalTitleIntroRetrieval.V2",
2625
+ "NLPJournalAbsIntroRetrieval.V2",
2626
+ "NLPJournalAbsArticleRetrieval.V2",
2627
+ "JaCWIRRetrieval",
2628
+ "MIRACLRetrieval",
2629
+ "MintakaRetrieval",
2630
+ "MultiLongDocRetrieval",
2631
+ # Reranking (5)
2632
+ "ESCIReranking",
2633
+ "JQaRAReranking",
2634
+ "JaCWIRReranking",
2635
+ "MIRACLReranking",
2636
+ "MultiLongDocReranking",
2637
+ ],
2638
+ ),
2639
+ description="JMTEB is a benchmark for evaluating Japanese text embedding models. In v2, we have extended the benchmark to 28 datasets, enabling more comprehensive evaluation compared with v1 (MTEB(jpn, v1)).",
2640
+ reference="https://github.com/sbintuitions/JMTEB",
2641
+ citation=r"""
2642
+ @article{li2025jmteb,
2643
+ author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
2644
+ issue = {3},
2645
+ journal = {Vol.2025-NL-265,No.3,1-15},
2646
+ month = {sep},
2647
+ title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
2648
+ year = {2025},
2649
+ }
2650
+ """,
2651
+ contacts=["lsz05"],
2652
+ )
2653
+
2654
+ JMTEB_LITE_V1 = Benchmark(
2655
+ name="JMTEB-lite(v1)",
2656
+ display_name="Japanese",
2657
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
2658
+ tasks=get_tasks(
2659
+ languages=["jpn"],
2660
+ tasks=[
2661
+ # Clustering (3)
2662
+ "LivedoorNewsClustering.v2",
2663
+ "MewsC16JaClustering",
2664
+ "SIB200ClusteringS2S",
2665
+ # Classification (7)
2666
+ "AmazonReviewsClassification",
2667
+ "AmazonCounterfactualClassification",
2668
+ "MassiveIntentClassification",
2669
+ "MassiveScenarioClassification",
2670
+ "JapaneseSentimentClassification",
2671
+ "SIB200Classification",
2672
+ "WRIMEClassification",
2673
+ # STS (2)
2674
+ "JSTS",
2675
+ "JSICK",
2676
+ # Retrieval (11)
2677
+ "JaqketRetrievalLite",
2678
+ "MrTyDiJaRetrievalLite",
2679
+ "JaGovFaqsRetrieval",
2680
+ "NLPJournalTitleAbsRetrieval.V2",
2681
+ "NLPJournalTitleIntroRetrieval.V2",
2682
+ "NLPJournalAbsIntroRetrieval.V2",
2683
+ "NLPJournalAbsArticleRetrieval.V2",
2684
+ "JaCWIRRetrievalLite",
2685
+ "MIRACLJaRetrievalLite",
2686
+ "MintakaRetrieval",
2687
+ "MultiLongDocRetrieval",
2688
+ # Reranking (5)
2689
+ "ESCIReranking",
2690
+ "JQaRARerankingLite",
2691
+ "JaCWIRRerankingLite",
2692
+ "MIRACLReranking",
2693
+ "MultiLongDocReranking",
2694
+ ],
2695
+ ),
2696
+ description="JMTEB-lite is a lightweight version of JMTEB. It makes agile evaluation possible by reaching an average of 5x faster evaluation comparing with JMTEB, as 6 heavy datasets in JMTEB are optimized with hard negative pooling strategy, making them much smaller. The result of JMTEB-lite is proved to be highly relevant with that of JMTEB, making it a faithful preview of JMTEB.",
2697
+ reference="https://huggingface.co/datasets/sbintuitions/JMTEB-lite",
2698
+ citation=r"""
2699
+ @article{li2025jmteb,
2700
+ author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
2701
+ issue = {3},
2702
+ journal = {Vol.2025-NL-265,No.3,1-15},
2703
+ month = {sep},
2704
+ title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
2705
+ year = {2025},
2706
+ }
2707
+ """,
2708
+ contacts=["lsz05"],
2709
+ )
@@ -39,6 +39,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
39
39
  MTEB_RETRIEVAL_MEDICAL,
40
40
  MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
41
41
  SEB,
42
+ VISUAL_DOCUMENT_RETRIEVAL,
42
43
  MTEB_code,
43
44
  MTEB_multilingual_v2,
44
45
  )
@@ -63,6 +64,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
63
64
  "MTEB(Chinese)": C_MTEB.name,
64
65
  "FaMTEB(fas, beta)": FA_MTEB.name,
65
66
  "BRIGHT(long)": BRIGHT_LONG.name,
67
+ "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
66
68
  }
67
69
  return previous_benchmark_names
68
70
 
mteb/cache.py CHANGED
@@ -8,7 +8,9 @@ from collections.abc import Sequence
8
8
  from pathlib import Path
9
9
  from typing import cast
10
10
 
11
+ import mteb
11
12
  from mteb.abstasks import AbsTask
13
+ from mteb.benchmarks.benchmark import Benchmark
12
14
  from mteb.models import ModelMeta
13
15
  from mteb.results import BenchmarkResults, ModelResult, TaskResult
14
16
  from mteb.types import ModelName, Revision
@@ -62,7 +64,11 @@ class ResultCache:
62
64
  Returns:
63
65
  The path to the results of the task.
64
66
  """
65
- results_folder = "results" if not remote else "remote"
67
+ results_folder = (
68
+ self.cache_path / "results"
69
+ if not remote
70
+ else self.cache_path / "remote" / "results"
71
+ )
66
72
 
67
73
  if isinstance(model_name, ModelMeta):
68
74
  if model_revision is not None:
@@ -74,7 +80,7 @@ class ResultCache:
74
80
  elif isinstance(model_name, str):
75
81
  model_name = model_name.replace("/", "__").replace(" ", "_")
76
82
 
77
- model_path = self.cache_path / results_folder / model_name
83
+ model_path = results_folder / model_name
78
84
 
79
85
  if model_revision is None:
80
86
  logger.warning(
@@ -191,12 +197,14 @@ class ResultCache:
191
197
  self,
192
198
  remote: str = "https://github.com/embeddings-benchmark/results",
193
199
  download_latest: bool = True,
200
+ revision: str | None = None,
194
201
  ) -> Path:
195
202
  """Downloads the latest version of the results repository from GitHub to a local cache directory. Required git to be installed.
196
203
 
197
204
  Args:
198
205
  remote: The URL of the results repository on GitHub.
199
206
  download_latest: If True it will download the latest version of the repository, otherwise it will only update the existing repository.
207
+ revision: If specified, it will checkout the given revision after cloning or pulling the repository.
200
208
 
201
209
  Returns:
202
210
  The path to the local cache directory.
@@ -224,14 +232,27 @@ class ResultCache:
224
232
  )
225
233
  raise ValueError(msg)
226
234
 
227
- if download_latest:
235
+ if revision or download_latest:
228
236
  logger.info(
229
- f"remote repository already exists in {results_directory}, updating it using git pull"
237
+ f"remote repository already exists in {results_directory}, fetching updates"
238
+ )
239
+ subprocess.run(
240
+ ["git", "fetch", "--all", "--tags"],
241
+ cwd=results_directory,
242
+ check=True,
230
243
  )
231
- subprocess.run(["git", "pull"], cwd=results_directory)
232
244
  else:
233
245
  logger.debug(
234
- f"Results repository already exists in {results_directory}, skipping update, set download_latest=True to update it"
246
+ f"Results repository already exists in {results_directory}, skipping update, "
247
+ f"set download_latest=True to update it"
248
+ )
249
+
250
+ if revision:
251
+ logger.info(f"Checking out revision '{revision}'")
252
+ subprocess.run(
253
+ ["git", "checkout", revision],
254
+ cwd=results_directory,
255
+ check=True,
235
256
  )
236
257
  return results_directory
237
258
 
@@ -239,7 +260,18 @@ class ResultCache:
239
260
  f"No results repository found in {results_directory}, cloning it from {remote}"
240
261
  )
241
262
 
242
- subprocess.run(["git", "clone", remote, "remote"], cwd=self.cache_path)
263
+ clone_cmd = ["git", "clone", "--depth", "1"]
264
+
265
+ if revision:
266
+ logger.info(f"Cloning repository at revision '{revision}'")
267
+ clone_cmd.append(f"--revision={revision}")
268
+ clone_cmd.extend([remote, "remote"])
269
+
270
+ subprocess.run(
271
+ clone_cmd,
272
+ cwd=self.cache_path,
273
+ check=True,
274
+ )
243
275
 
244
276
  return results_directory
245
277
 
@@ -435,7 +467,7 @@ class ResultCache:
435
467
  def load_results(
436
468
  self,
437
469
  models: Sequence[str] | Sequence[ModelMeta] | None = None,
438
- tasks: Sequence[str] | Sequence[AbsTask] | None = None,
470
+ tasks: Sequence[str] | Sequence[AbsTask] | Benchmark | str | None = None,
439
471
  require_model_meta: bool = True,
440
472
  include_remote: bool = True,
441
473
  validate_and_filter: bool = False,
@@ -445,7 +477,8 @@ class ResultCache:
445
477
 
446
478
  Args:
447
479
  models: A list of model names to load the results for. If None it will load the results for all models.
448
- tasks: A list of task names to load the results for. If None it will load the results for all tasks.
480
+ tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
481
+ If None it will load the results for all tasks.
449
482
  require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
450
483
  extract the model name and revision from the path.
451
484
  include_remote: If True, it will include results from the remote repository.
@@ -467,6 +500,9 @@ class ResultCache:
467
500
  ... require_model_meta=True,
468
501
  ... )
469
502
  """
503
+ if isinstance(tasks, str):
504
+ tasks = mteb.get_benchmark(tasks)
505
+
470
506
  paths = self.get_cache_paths(
471
507
  models=models,
472
508
  tasks=tasks,
@@ -495,7 +531,7 @@ class ResultCache:
495
531
  if validate_and_filter:
496
532
  task = task_names[task_result.task_name]
497
533
  try:
498
- task_result.validate_and_filter_scores(task=task)
534
+ task_result = task_result.validate_and_filter_scores(task=task)
499
535
  except Exception as e:
500
536
  logger.info(
501
537
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -516,6 +552,7 @@ class ResultCache:
516
552
 
517
553
  benchmark_results = BenchmarkResults(
518
554
  model_results=models_results,
555
+ benchmark=tasks if isinstance(tasks, Benchmark) else None,
519
556
  )
520
557
 
521
558
  return benchmark_results
@@ -13,21 +13,11 @@ from pathlib import Path
13
13
  from time import time
14
14
  from typing import TYPE_CHECKING, Any
15
15
 
16
- from mteb.abstasks.task_metadata import TaskCategory, TaskType
17
- from mteb.models.get_model_meta import (
18
- _model_meta_from_cross_encoder,
19
- _model_meta_from_sentence_transformers,
20
- )
21
-
22
- if sys.version_info >= (3, 13):
23
- from warnings import deprecated
24
- else:
25
- from typing_extensions import deprecated
26
-
27
16
  import datasets
28
17
 
29
18
  import mteb
30
19
  from mteb.abstasks import AbsTask
20
+ from mteb.abstasks.task_metadata import TaskCategory, TaskType
31
21
  from mteb.benchmarks import Benchmark
32
22
  from mteb.models import (
33
23
  CrossEncoderWrapper,
@@ -39,6 +29,11 @@ from mteb.models import (
39
29
  from mteb.results import TaskResult
40
30
  from mteb.types import ScoresDict
41
31
 
32
+ if sys.version_info >= (3, 13):
33
+ from warnings import deprecated
34
+ else:
35
+ from typing_extensions import deprecated
36
+
42
37
  if TYPE_CHECKING:
43
38
  from sentence_transformers import CrossEncoder, SentenceTransformer
44
39
 
@@ -669,9 +664,9 @@ class MTEB:
669
664
  from sentence_transformers import CrossEncoder, SentenceTransformer
670
665
 
671
666
  if isinstance(model, CrossEncoder):
672
- meta = _model_meta_from_cross_encoder(model)
667
+ meta = ModelMeta.from_cross_encoder(model)
673
668
  elif isinstance(model, SentenceTransformer):
674
- meta = _model_meta_from_sentence_transformers(model)
669
+ meta = ModelMeta.from_sentence_transformer_model(model)
675
670
  else:
676
671
  meta = ModelMeta(
677
672
  loader=None,
@@ -0,0 +1,61 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 19928,
4
+ "number_of_characters": 35466331,
5
+ "unique_pairs": 19928,
6
+ "sentence1_statistics": {
7
+ "total_text_length": 17733346,
8
+ "min_text_length": 103,
9
+ "average_text_length": 889.8708350060217,
10
+ "max_text_length": 11576,
11
+ "unique_texts": 19928
12
+ },
13
+ "sentence2_statistics": {
14
+ "total_text_length": 17732985,
15
+ "min_text_length": 103,
16
+ "average_text_length": 889.8527197912485,
17
+ "max_text_length": 11576,
18
+ "unique_texts": 19928
19
+ },
20
+ "hf_subset_descriptive_stats": {
21
+ "ru-en": {
22
+ "num_samples": 9965,
23
+ "number_of_characters": 17734926,
24
+ "unique_pairs": 9965,
25
+ "sentence1_statistics": {
26
+ "total_text_length": 8685585,
27
+ "min_text_length": 103,
28
+ "average_text_length": 871.6091319618665,
29
+ "max_text_length": 5675,
30
+ "unique_texts": 9965
31
+ },
32
+ "sentence2_statistics": {
33
+ "total_text_length": 9049341,
34
+ "min_text_length": 106,
35
+ "average_text_length": 908.1124937280482,
36
+ "max_text_length": 11576,
37
+ "unique_texts": 9965
38
+ }
39
+ },
40
+ "en-ru": {
41
+ "num_samples": 9963,
42
+ "number_of_characters": 17731405,
43
+ "unique_pairs": 9963,
44
+ "sentence1_statistics": {
45
+ "total_text_length": 9047761,
46
+ "min_text_length": 106,
47
+ "average_text_length": 908.1362039546322,
48
+ "max_text_length": 11576,
49
+ "unique_texts": 9963
50
+ },
51
+ "sentence2_statistics": {
52
+ "total_text_length": 8683644,
53
+ "min_text_length": 103,
54
+ "average_text_length": 871.5892803372478,
55
+ "max_text_length": 5675,
56
+ "unique_texts": 9963
57
+ }
58
+ }
59
+ }
60
+ }
61
+ }