mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (412) hide show
  1. mteb/__init__.py +10 -1
  2. mteb/_create_dataloaders.py +8 -3
  3. mteb/_evaluators/any_sts_evaluator.py +14 -12
  4. mteb/_evaluators/clustering_evaluator.py +1 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  7. mteb/_evaluators/retrieval_metrics.py +0 -9
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_stratification.py +1 -1
  13. mteb/abstasks/abstask.py +6 -1
  14. mteb/abstasks/clustering.py +1 -1
  15. mteb/abstasks/dataset_card_template.md +1 -1
  16. mteb/abstasks/multilabel_classification.py +2 -2
  17. mteb/abstasks/retrieval.py +2 -1
  18. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  19. mteb/abstasks/task_metadata.py +2 -1
  20. mteb/benchmarks/_create_table.py +1 -3
  21. mteb/benchmarks/benchmark.py +18 -1
  22. mteb/benchmarks/benchmarks/__init__.py +4 -0
  23. mteb/benchmarks/benchmarks/benchmarks.py +125 -16
  24. mteb/benchmarks/get_benchmark.py +3 -1
  25. mteb/cache.py +7 -3
  26. mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
  27. mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
  28. mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
  29. mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
  30. mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
  31. mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
  32. mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
  33. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
  34. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
  35. mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
  36. mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
  37. mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
  38. mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
  39. mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
  40. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  41. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  42. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  43. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  44. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  54. mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
  55. mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
  56. mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
  57. mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
  58. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  59. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  60. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  61. mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
  62. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  63. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  64. mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
  65. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  66. mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
  67. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  68. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  69. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  71. mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
  72. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  73. mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
  74. mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
  75. mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
  76. mteb/evaluate.py +26 -6
  77. mteb/languages/check_language_code.py +11 -3
  78. mteb/languages/language_scripts.py +4 -0
  79. mteb/leaderboard/app.py +5 -3
  80. mteb/leaderboard/benchmark_selector.py +4 -2
  81. mteb/leaderboard/text_segments.py +1 -1
  82. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  83. mteb/models/instruct_wrapper.py +3 -0
  84. mteb/models/model_implementations/align_models.py +6 -0
  85. mteb/models/model_implementations/andersborges.py +51 -0
  86. mteb/models/model_implementations/ara_models.py +7 -0
  87. mteb/models/model_implementations/b1ade_models.py +1 -1
  88. mteb/models/model_implementations/bge_models.py +1 -3
  89. mteb/models/model_implementations/blip2_models.py +9 -0
  90. mteb/models/model_implementations/blip_models.py +19 -0
  91. mteb/models/model_implementations/bmretriever_models.py +1 -1
  92. mteb/models/model_implementations/cadet_models.py +8 -0
  93. mteb/models/model_implementations/cde_models.py +12 -0
  94. mteb/models/model_implementations/codefuse_models.py +15 -0
  95. mteb/models/model_implementations/codesage_models.py +12 -0
  96. mteb/models/model_implementations/cohere_models.py +1 -1
  97. mteb/models/model_implementations/colqwen_models.py +57 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +70 -0
  99. mteb/models/model_implementations/gme_v_models.py +2 -2
  100. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  101. mteb/models/model_implementations/inf_models.py +3 -3
  102. mteb/models/model_implementations/jasper_models.py +253 -2
  103. mteb/models/model_implementations/jina_models.py +12 -2
  104. mteb/models/model_implementations/kalm_models.py +159 -25
  105. mteb/models/model_implementations/llm2vec_models.py +1 -1
  106. mteb/models/model_implementations/misc_models.py +8 -2
  107. mteb/models/model_implementations/moco_models.py +9 -0
  108. mteb/models/model_implementations/mxbai_models.py +1 -1
  109. mteb/models/model_implementations/openclip_models.py +16 -0
  110. mteb/models/model_implementations/piccolo_models.py +6 -0
  111. mteb/models/model_implementations/rasgaard_models.py +33 -0
  112. mteb/models/model_implementations/reasonir_model.py +1 -1
  113. mteb/models/model_implementations/salesforce_models.py +1 -1
  114. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  115. mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
  116. mteb/models/model_implementations/tarka_models.py +374 -0
  117. mteb/models/model_implementations/voyage_models.py +6 -7
  118. mteb/models/model_implementations/voyage_v.py +10 -9
  119. mteb/models/model_implementations/yuan_models.py +33 -0
  120. mteb/models/search_wrappers.py +6 -5
  121. mteb/results/task_result.py +19 -17
  122. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  123. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  124. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
  125. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  126. mteb/tasks/classification/ara/ajgt.py +1 -2
  127. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  128. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  129. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  130. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  131. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  132. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  133. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  134. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  135. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  136. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  137. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  138. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
  139. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  140. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  141. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  142. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  143. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  144. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  145. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  146. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  147. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  148. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  149. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  150. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  151. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  152. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  153. mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
  154. mteb/tasks/classification/eng/news_classification.py +1 -2
  155. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  156. mteb/tasks/classification/eng/patent_classification.py +1 -2
  157. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  158. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  159. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  160. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  161. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  162. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  163. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  164. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  165. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  166. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  167. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  168. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  169. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  170. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  171. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  172. mteb/tasks/classification/est/estonian_valence.py +1 -2
  173. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  174. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  175. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  176. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  177. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  178. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  179. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
  180. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  181. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  182. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  183. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  184. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  185. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  186. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  187. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  188. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  189. mteb/tasks/classification/kor/klue_tc.py +1 -2
  190. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  191. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  192. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  193. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  194. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  195. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  196. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  197. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  198. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  199. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  200. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  201. mteb/tasks/classification/mya/myanmar_news.py +2 -3
  202. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  203. mteb/tasks/classification/nld/__init__.py +16 -0
  204. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  205. mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
  206. mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
  207. mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
  208. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
  209. mteb/tasks/classification/nld/iconclass_classification.py +44 -0
  210. mteb/tasks/classification/nld/open_tender_classification.py +41 -0
  211. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
  212. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  213. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  214. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  215. mteb/tasks/classification/pol/polish_classification.py +3 -6
  216. mteb/tasks/classification/ron/moroco.py +1 -2
  217. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  218. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  220. mteb/tasks/classification/rus/headline_classification.py +1 -2
  221. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  222. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  223. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  224. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  225. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  226. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  227. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  228. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  229. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  230. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  231. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  232. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  233. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  234. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  235. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  236. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  237. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  238. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  239. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  240. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  241. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  242. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  243. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  244. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  245. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  246. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  247. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  248. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  249. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  250. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  251. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  252. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  253. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  254. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  255. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  256. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  257. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  258. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  259. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  260. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  261. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  262. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  263. mteb/tasks/clustering/__init__.py +1 -0
  264. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  265. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  266. mteb/tasks/clustering/nld/__init__.py +17 -0
  267. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
  268. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
  269. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
  270. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
  271. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
  272. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
  273. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
  274. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  275. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  276. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  277. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  278. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  279. mteb/tasks/multilabel_classification/__init__.py +1 -0
  280. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  281. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  282. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  283. mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
  284. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
  285. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
  286. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  287. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  288. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  289. mteb/tasks/pair_classification/__init__.py +1 -0
  290. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  291. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  292. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  293. mteb/tasks/pair_classification/nld/__init__.py +7 -0
  294. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
  295. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
  296. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  297. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  298. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  299. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  300. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  301. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  302. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  303. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  304. mteb/tasks/retrieval/code/code_rag.py +8 -8
  305. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  306. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  307. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  308. mteb/tasks/retrieval/eng/__init__.py +18 -4
  309. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  310. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  311. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  312. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  313. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  314. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  315. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  316. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  317. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  318. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  319. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  320. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  321. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  322. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  323. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  324. mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
  325. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  326. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  327. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  328. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  329. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
  330. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  331. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  332. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  333. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  334. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  335. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  336. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  337. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  338. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  339. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  340. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  341. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  342. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  343. mteb/tasks/retrieval/nld/__init__.py +18 -4
  344. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  345. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
  346. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
  347. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
  348. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  349. mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
  350. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  351. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  352. mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
  353. mteb/tasks/retrieval/nob/norquad.py +2 -2
  354. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  355. mteb/tasks/retrieval/rus/__init__.py +11 -2
  356. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  357. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  358. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  359. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  360. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  361. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  362. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  363. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  364. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  365. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  366. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  367. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  368. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  369. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  370. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  371. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  372. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  373. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  374. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  375. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  376. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  377. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  378. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  379. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  380. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  381. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  382. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  383. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  384. mteb/tasks/sts/__init__.py +1 -0
  385. mteb/tasks/sts/nld/__init__.py +5 -0
  386. mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
  387. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  388. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  389. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  390. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  391. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  392. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  393. mteb-2.1.19.dist-info/METADATA +253 -0
  394. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
  395. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  396. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  397. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  398. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  399. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  400. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  401. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  402. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  403. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  404. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  405. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  406. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  407. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  408. mteb-2.0.5.dist-info/METADATA +0 -455
  409. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
  410. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
  411. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
  412. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,9 @@
1
- from mteb.benchmarks.benchmark import Benchmark, HUMEBenchmark, MIEBBenchmark
1
+ from mteb.benchmarks.benchmark import (
2
+ Benchmark,
3
+ HUMEBenchmark,
4
+ MIEBBenchmark,
5
+ VidoreBenchmark,
6
+ )
2
7
  from mteb.get_tasks import MTEBTasks, get_task, get_tasks
3
8
 
4
9
  MMTEB_CITATION = r"""@article{enevoldsen2025mmtebmassivemultilingualtext,
@@ -641,7 +646,7 @@ MTEB_KOR = Benchmark(
641
646
  icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
642
647
  tasks=get_tasks(
643
648
  languages=["kor"],
644
- tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is.
649
+ tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
645
650
  # Classification
646
651
  "KLUE-TC",
647
652
  # Reranking
@@ -975,8 +980,6 @@ MTEB_INDIC = Benchmark(
975
980
  # Bitext
976
981
  "IN22ConvBitextMining",
977
982
  "IN22GenBitextMining",
978
- "IndicGenBenchFloresBitextMining",
979
- "LinceMTBitextMining",
980
983
  # clustering
981
984
  "SIB200ClusteringS2S",
982
985
  # classification
@@ -985,7 +988,6 @@ MTEB_INDIC = Benchmark(
985
988
  "HindiDiscourseClassification",
986
989
  "SentimentAnalysisHindi",
987
990
  "MalayalamNewsClassification",
988
- "IndicLangClassification",
989
991
  "MTOPIntentClassification",
990
992
  "MultiHateClassification",
991
993
  "TweetSentimentClassification",
@@ -1008,7 +1010,7 @@ MTEB_INDIC = Benchmark(
1008
1010
  # STS
1009
1011
  (get_task("IndicCrosslingualSTS"),)
1010
1012
  ),
1011
- description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
1013
+ description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
1012
1014
  reference=None,
1013
1015
  citation=MMTEB_CITATION,
1014
1016
  contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1018,7 @@ MTEB_INDIC = Benchmark(
1016
1018
 
1017
1019
 
1018
1020
  eu_languages = [
1019
- # official EU languages (56) - we could include the whole economic area e.g. Norway - additioanlly we could include minority languages (probably a good idea?)
1021
+ # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
1020
1022
  # germanic
1021
1023
  "dan",
1022
1024
  "eng",
@@ -1084,7 +1086,6 @@ MTEB_EU = Benchmark(
1084
1086
  "AmazonCounterfactualClassification",
1085
1087
  "MassiveScenarioClassification",
1086
1088
  "MultiHateClassification",
1087
- "NordicLangClassification",
1088
1089
  "ScalaClassification",
1089
1090
  "SwissJudgementClassification",
1090
1091
  "TweetSentimentClassification",
@@ -1142,7 +1143,7 @@ MTEB_EU = Benchmark(
1142
1143
  languages=eu_languages,
1143
1144
  exclusive_language_filter=True,
1144
1145
  ),
1145
- description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
1146
+ description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
1146
1147
  reference=None,
1147
1148
  citation=MMTEB_CITATION,
1148
1149
  contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1636,6 +1637,81 @@ BEIR_NL = Benchmark(
1636
1637
  """,
1637
1638
  )
1638
1639
 
1640
+ MTEB_NL = Benchmark(
1641
+ name="MTEB(nld, v1)",
1642
+ display_name="Dutch",
1643
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
1644
+ tasks=MTEBTasks(
1645
+ get_tasks(
1646
+ languages=["nld"],
1647
+ exclusive_language_filter=True,
1648
+ tasks=[
1649
+ # Classification
1650
+ "DutchBookReviewSentimentClassification.v2",
1651
+ "MassiveIntentClassification",
1652
+ "MassiveScenarioClassification",
1653
+ "SIB200Classification",
1654
+ "MultiHateClassification",
1655
+ "VaccinChatNLClassification",
1656
+ "DutchColaClassification",
1657
+ "DutchGovernmentBiasClassification",
1658
+ "DutchSarcasticHeadlinesClassification",
1659
+ "DutchNewsArticlesClassification",
1660
+ "OpenTenderClassification",
1661
+ "IconclassClassification",
1662
+ # # PairClassification
1663
+ "SICKNLPairClassification",
1664
+ "XLWICNLPairClassification",
1665
+ # # MultiLabelClassification
1666
+ "CovidDisinformationNLMultiLabelClassification",
1667
+ "MultiEURLEXMultilabelClassification",
1668
+ "VABBMultiLabelClassification",
1669
+ # # Clustering
1670
+ "DutchNewsArticlesClusteringS2S",
1671
+ "DutchNewsArticlesClusteringP2P",
1672
+ "SIB200ClusteringS2S",
1673
+ "VABBClusteringS2S",
1674
+ "VABBClusteringP2P",
1675
+ "OpenTenderClusteringS2S",
1676
+ "OpenTenderClusteringP2P",
1677
+ "IconclassClusteringS2S",
1678
+ # # Reranking
1679
+ "WikipediaRerankingMultilingual",
1680
+ # # Retrieval
1681
+ "ArguAna-NL.v2",
1682
+ "SCIDOCS-NL.v2",
1683
+ "SciFact-NL.v2",
1684
+ "NFCorpus-NL.v2",
1685
+ "BelebeleRetrieval",
1686
+ "WebFAQRetrieval",
1687
+ "DutchNewsArticlesRetrieval",
1688
+ "bBSARDNLRetrieval",
1689
+ "LegalQANLRetrieval",
1690
+ "OpenTenderRetrieval",
1691
+ "VABBRetrieval",
1692
+ "WikipediaRetrievalMultilingual",
1693
+ # # STS
1694
+ "SICK-NL-STS",
1695
+ "STSBenchmarkMultilingualSTS",
1696
+ ],
1697
+ )
1698
+ ),
1699
+ description="MTEB-NL",
1700
+ reference="https://arxiv.org/abs/2509.12340",
1701
+ contacts=["nikolay-banar"],
1702
+ citation=r"""
1703
+ @misc{banar2025mtebnle5nlembeddingbenchmark,
1704
+ archiveprefix = {arXiv},
1705
+ author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
1706
+ eprint = {2509.12340},
1707
+ primaryclass = {cs.CL},
1708
+ title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
1709
+ url = {https://arxiv.org/abs/2509.12340},
1710
+ year = {2025},
1711
+ }
1712
+ """,
1713
+ )
1714
+
1639
1715
  MIEB_common_tasks = [
1640
1716
  # Image Classification
1641
1717
  "Birdsnap", # fine
@@ -1783,7 +1859,7 @@ MIEB_ENG = MIEBBenchmark(
1783
1859
  ),
1784
1860
  description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
1785
1861
  In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
1786
- document undestanding, visual STS, and CV-centric tasks.""",
1862
+ document understanding, visual STS, and CV-centric tasks.""",
1787
1863
  reference="https://arxiv.org/abs/2504.10471",
1788
1864
  contacts=["gowitheflow-1998", "isaac-chung"],
1789
1865
  citation=r"""
@@ -1817,7 +1893,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
1817
1893
  ),
1818
1894
  description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
1819
1895
  In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
1820
- document undestanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
1896
+ document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
1821
1897
  datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
1822
1898
  reference="https://arxiv.org/abs/2504.10471",
1823
1899
  contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2038,7 +2114,7 @@ BUILT_MTEB = Benchmark(
2038
2114
  "BuiltBenchReranking",
2039
2115
  ],
2040
2116
  ),
2041
- description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various dicsiplines such as architeture, engineering, constrcution, and operations management of the built environment.',
2117
+ description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
2042
2118
  reference="https://arxiv.org/abs/2411.12056",
2043
2119
  citation=r"""
2044
2120
  @article{shahinmoghadam2024benchmarking,
@@ -2143,10 +2219,43 @@ VIDORE_V2 = Benchmark(
2143
2219
  """,
2144
2220
  )
2145
2221
 
2146
- VISUAL_DOCUMENT_RETRIEVAL = Benchmark(
2147
- name="VisualDocumentRetrieval",
2148
- display_name="Visual Document Retrieval",
2149
- icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
2222
+ VIDORE_V3 = VidoreBenchmark(
2223
+ name="ViDoRe(v3)",
2224
+ display_name="ViDoRe V3",
2225
+ icon="https://cdn-uploads.huggingface.co/production/uploads/66e16a677c2eb2da5109fb5c/x99xqw__fl2UaPbiIdC_f.png",
2226
+ tasks=get_tasks(
2227
+ tasks=[
2228
+ "Vidore3FinanceEnRetrieval",
2229
+ "Vidore3IndustrialRetrieval",
2230
+ "Vidore3ComputerScienceRetrieval",
2231
+ "Vidore3PharmaceuticalsRetrieval",
2232
+ "Vidore3HrRetrieval",
2233
+ "Vidore3FinanceFrRetrieval",
2234
+ "Vidore3PhysicsRetrieval",
2235
+ "Vidore3EnergyRetrieval",
2236
+ "Vidore3TelecomRetrieval",
2237
+ "Vidore3NuclearRetrieval",
2238
+ ]
2239
+ ),
2240
+ description="ViDoRe V3 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents. The benchmark includes both open and closed datasets: to submit results on private tasks, please [open an issue](https://github.com/embeddings-benchmark/mteb/issues?template=eval_request.yaml).",
2241
+ reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
2242
+ citation=r"""
2243
+ @misc{mace2025vidorev3,
2244
+ author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
2245
+ day = {5},
2246
+ howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
2247
+ journal = {Hugging Face Blog},
2248
+ month = {November},
2249
+ publisher = {Hugging Face},
2250
+ title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
2251
+ year = {2025},
2252
+ }
2253
+ """,
2254
+ )
2255
+
2256
+ VISUAL_DOCUMENT_RETRIEVAL = VidoreBenchmark(
2257
+ name="ViDoRe(v1&v2)",
2258
+ display_name="ViDoRe (V1&V2)",
2150
2259
  tasks=get_tasks(
2151
2260
  tasks=[
2152
2261
  # v1
@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:
14
14
 
15
15
  benchmark_registry = {
16
16
  inst.name: inst
17
- for nam, inst in benchmark_module.__dict__.items()
17
+ for _, inst in benchmark_module.__dict__.items()
18
18
  if isinstance(inst, Benchmark)
19
19
  }
20
20
  return benchmark_registry
@@ -39,6 +39,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
39
39
  MTEB_RETRIEVAL_MEDICAL,
40
40
  MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
41
41
  SEB,
42
+ VISUAL_DOCUMENT_RETRIEVAL,
42
43
  MTEB_code,
43
44
  MTEB_multilingual_v2,
44
45
  )
@@ -63,6 +64,7 @@ def _get_previous_benchmark_names() -> dict[str, str]:
63
64
  "MTEB(Chinese)": C_MTEB.name,
64
65
  "FaMTEB(fas, beta)": FA_MTEB.name,
65
66
  "BRIGHT(long)": BRIGHT_LONG.name,
67
+ "VisualDocumentRetrieval": VISUAL_DOCUMENT_RETRIEVAL.name,
66
68
  }
67
69
  return previous_benchmark_names
68
70
 
mteb/cache.py CHANGED
@@ -62,7 +62,11 @@ class ResultCache:
62
62
  Returns:
63
63
  The path to the results of the task.
64
64
  """
65
- results_folder = "results" if not remote else "remote"
65
+ results_folder = (
66
+ self.cache_path / "results"
67
+ if not remote
68
+ else self.cache_path / "remote" / "results"
69
+ )
66
70
 
67
71
  if isinstance(model_name, ModelMeta):
68
72
  if model_revision is not None:
@@ -74,7 +78,7 @@ class ResultCache:
74
78
  elif isinstance(model_name, str):
75
79
  model_name = model_name.replace("/", "__").replace(" ", "_")
76
80
 
77
- model_path = self.cache_path / results_folder / model_name
81
+ model_path = results_folder / model_name
78
82
 
79
83
  if model_revision is None:
80
84
  logger.warning(
@@ -495,7 +499,7 @@ class ResultCache:
495
499
  if validate_and_filter:
496
500
  task = task_names[task_result.task_name]
497
501
  try:
498
- task_result.validate_and_filter_scores(task=task)
502
+ task_result = task_result.validate_and_filter_scores(task=task)
499
503
  except Exception as e:
500
504
  logger.info(
501
505
  f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2400,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 92146,
7
+ "min_text_length": 5,
8
+ "average_text_length": 38.39416666666666,
9
+ "max_text_length": 138,
10
+ "unique_texts": 2400
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "1": {
20
+ "count": 1200
21
+ },
22
+ "0": {
23
+ "count": 1200
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 19893,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 761416,
33
+ "min_text_length": 4,
34
+ "average_text_length": 38.27557432262605,
35
+ "max_text_length": 152,
36
+ "unique_texts": 19893
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 12604
47
+ },
48
+ "0": {
49
+ "count": 7289
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 752,
4
+ "number_texts_intersect_with_train": 100,
5
+ "text_statistics": {
6
+ "total_text_length": 171956,
7
+ "min_text_length": 32,
8
+ "average_text_length": 228.66489361702128,
9
+ "max_text_length": 2746,
10
+ "unique_texts": 752
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0.0": {
20
+ "count": 555
21
+ },
22
+ "1.0": {
23
+ "count": 197
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 1718,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 390362,
33
+ "min_text_length": 18,
34
+ "average_text_length": 227.2188591385332,
35
+ "max_text_length": 2662,
36
+ "unique_texts": 1718
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1.0": {
46
+ "count": 470
47
+ },
48
+ "0.0": {
49
+ "count": 1248
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,90 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1200,
4
+ "number_texts_intersect_with_train": 1,
5
+ "text_statistics": {
6
+ "total_text_length": 2034506,
7
+ "min_text_length": 184,
8
+ "average_text_length": 1695.4216666666666,
9
+ "max_text_length": 8825,
10
+ "unique_texts": 1200
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 8,
18
+ "labels": {
19
+ "Opmerkelijk": {
20
+ "count": 150
21
+ },
22
+ "Buitenland": {
23
+ "count": 150
24
+ },
25
+ "Cultuur & Media": {
26
+ "count": 150
27
+ },
28
+ "Binnenland": {
29
+ "count": 150
30
+ },
31
+ "Politiek": {
32
+ "count": 150
33
+ },
34
+ "Economie": {
35
+ "count": 150
36
+ },
37
+ "Tech": {
38
+ "count": 150
39
+ },
40
+ "Regionaal nieuws": {
41
+ "count": 150
42
+ }
43
+ }
44
+ }
45
+ },
46
+ "train": {
47
+ "num_samples": 5600,
48
+ "number_texts_intersect_with_train": null,
49
+ "text_statistics": {
50
+ "total_text_length": 9620538,
51
+ "min_text_length": 106,
52
+ "average_text_length": 1717.9532142857142,
53
+ "max_text_length": 29389,
54
+ "unique_texts": 5600
55
+ },
56
+ "image_statistics": null,
57
+ "label_statistics": {
58
+ "min_labels_per_text": 1,
59
+ "average_label_per_text": 1.0,
60
+ "max_labels_per_text": 1,
61
+ "unique_labels": 8,
62
+ "labels": {
63
+ "Cultuur & Media": {
64
+ "count": 700
65
+ },
66
+ "Binnenland": {
67
+ "count": 700
68
+ },
69
+ "Buitenland": {
70
+ "count": 700
71
+ },
72
+ "Regionaal nieuws": {
73
+ "count": 700
74
+ },
75
+ "Politiek": {
76
+ "count": 700
77
+ },
78
+ "Economie": {
79
+ "count": 700
80
+ },
81
+ "Opmerkelijk": {
82
+ "count": 700
83
+ },
84
+ "Tech": {
85
+ "count": 700
86
+ }
87
+ }
88
+ }
89
+ }
90
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1326,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 82644,
7
+ "min_text_length": 17,
8
+ "average_text_length": 62.32579185520362,
9
+ "max_text_length": 117,
10
+ "unique_texts": 1326
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0": {
20
+ "count": 826
21
+ },
22
+ "1": {
23
+ "count": 500
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 10609,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 658787,
33
+ "min_text_length": 7,
34
+ "average_text_length": 62.09699311904986,
35
+ "max_text_length": 161,
36
+ "unique_texts": 10609
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 4000
47
+ },
48
+ "0": {
49
+ "count": 6609
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,96 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 202,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 11827,
7
+ "min_text_length": 6,
8
+ "average_text_length": 58.54950495049505,
9
+ "max_text_length": 403,
10
+ "unique_texts": 202
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 9,
18
+ "labels": {
19
+ "Geschiedenis": {
20
+ "count": 22
21
+ },
22
+ "Klassieke mythologie en Oude Geschiedenis": {
23
+ "count": 22
24
+ },
25
+ "Literatuur": {
26
+ "count": 23
27
+ },
28
+ "Natuur": {
29
+ "count": 23
30
+ },
31
+ "De mens, de mensheid in het algemeen": {
32
+ "count": 22
33
+ },
34
+ "Maatschappij, civilisatie en cultuur": {
35
+ "count": 22
36
+ },
37
+ "Abstracte idee\u00ebn en concepten": {
38
+ "count": 23
39
+ },
40
+ "Religie en magie": {
41
+ "count": 22
42
+ },
43
+ "Bijbel": {
44
+ "count": 23
45
+ }
46
+ }
47
+ }
48
+ },
49
+ "train": {
50
+ "num_samples": 945,
51
+ "number_texts_intersect_with_train": null,
52
+ "text_statistics": {
53
+ "total_text_length": 52510,
54
+ "min_text_length": 3,
55
+ "average_text_length": 55.56613756613756,
56
+ "max_text_length": 793,
57
+ "unique_texts": 945
58
+ },
59
+ "image_statistics": null,
60
+ "label_statistics": {
61
+ "min_labels_per_text": 1,
62
+ "average_label_per_text": 1.0,
63
+ "max_labels_per_text": 1,
64
+ "unique_labels": 9,
65
+ "labels": {
66
+ "Literatuur": {
67
+ "count": 105
68
+ },
69
+ "Maatschappij, civilisatie en cultuur": {
70
+ "count": 105
71
+ },
72
+ "Klassieke mythologie en Oude Geschiedenis": {
73
+ "count": 105
74
+ },
75
+ "Bijbel": {
76
+ "count": 105
77
+ },
78
+ "De mens, de mensheid in het algemeen": {
79
+ "count": 105
80
+ },
81
+ "Abstracte idee\u00ebn en concepten": {
82
+ "count": 105
83
+ },
84
+ "Natuur": {
85
+ "count": 105
86
+ },
87
+ "Geschiedenis": {
88
+ "count": 105
89
+ },
90
+ "Religie en magie": {
91
+ "count": 105
92
+ }
93
+ }
94
+ }
95
+ }
96
+ }