mteb 2.5.2__py3-none-any.whl → 2.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (529) hide show
  1. mteb/__init__.py +2 -0
  2. mteb/_create_dataloaders.py +78 -30
  3. mteb/_evaluators/any_sts_evaluator.py +13 -6
  4. mteb/_evaluators/clustering_evaluator.py +13 -5
  5. mteb/_evaluators/evaluator.py +12 -4
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +22 -11
  7. mteb/_evaluators/pair_classification_evaluator.py +17 -7
  8. mteb/_evaluators/retrieval_evaluator.py +23 -14
  9. mteb/_evaluators/retrieval_metrics.py +26 -19
  10. mteb/_evaluators/sklearn_evaluator.py +27 -17
  11. mteb/_evaluators/text/bitext_mining_evaluator.py +36 -20
  12. mteb/_evaluators/text/summarization_evaluator.py +31 -20
  13. mteb/_evaluators/zeroshot_classification_evaluator.py +16 -5
  14. mteb/_helpful_enum.py +5 -1
  15. mteb/abstasks/_data_filter/filters.py +9 -3
  16. mteb/abstasks/_data_filter/task_pipelines.py +10 -2
  17. mteb/abstasks/_statistics_calculation.py +21 -11
  18. mteb/abstasks/_stratification.py +18 -18
  19. mteb/abstasks/abstask.py +78 -44
  20. mteb/abstasks/aggregate_task_metadata.py +21 -18
  21. mteb/abstasks/aggregated_task.py +23 -35
  22. mteb/abstasks/classification.py +39 -18
  23. mteb/abstasks/clustering.py +37 -20
  24. mteb/abstasks/clustering_legacy.py +30 -16
  25. mteb/abstasks/image/image_text_pair_classification.py +26 -9
  26. mteb/abstasks/multilabel_classification.py +33 -21
  27. mteb/abstasks/pair_classification.py +44 -19
  28. mteb/abstasks/regression.py +18 -10
  29. mteb/abstasks/retrieval.py +82 -52
  30. mteb/abstasks/retrieval_dataset_loaders.py +50 -39
  31. mteb/abstasks/sts.py +34 -15
  32. mteb/abstasks/task_metadata.py +44 -37
  33. mteb/abstasks/text/bitext_mining.py +57 -35
  34. mteb/abstasks/text/reranking.py +10 -8
  35. mteb/abstasks/text/summarization.py +26 -10
  36. mteb/abstasks/zeroshot_classification.py +27 -9
  37. mteb/benchmarks/_create_table.py +13 -7
  38. mteb/benchmarks/benchmark.py +15 -3
  39. mteb/benchmarks/benchmarks/__init__.py +6 -0
  40. mteb/benchmarks/benchmarks/benchmarks.py +153 -13
  41. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  42. mteb/benchmarks/get_benchmark.py +14 -55
  43. mteb/cache.py +189 -31
  44. mteb/cli/_display_tasks.py +10 -4
  45. mteb/cli/build_cli.py +112 -13
  46. mteb/cli/generate_model_card.py +50 -23
  47. mteb/deprecated_evaluator.py +72 -54
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  52. mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
  53. mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
  54. mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
  55. mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
  56. mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
  57. mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
  58. mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
  59. mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
  60. mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
  61. mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
  62. mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
  63. mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
  64. mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
  65. mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
  66. mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
  67. mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
  68. mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
  69. mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
  70. mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
  71. mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
  72. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  73. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  74. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  75. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  81. mteb/evaluate.py +71 -47
  82. mteb/filter_tasks.py +36 -32
  83. mteb/get_tasks.py +37 -33
  84. mteb/languages/language_scripts.py +11 -4
  85. mteb/leaderboard/app.py +172 -37
  86. mteb/leaderboard/table.py +7 -2
  87. mteb/load_results.py +20 -14
  88. mteb/models/abs_encoder.py +30 -16
  89. mteb/models/cache_wrappers/cache_backend_protocol.py +7 -7
  90. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +10 -5
  91. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +13 -4
  92. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  93. mteb/models/cache_wrappers/cache_wrapper.py +16 -11
  94. mteb/models/get_model_meta.py +53 -9
  95. mteb/models/instruct_wrapper.py +41 -13
  96. mteb/models/model_implementations/align_models.py +11 -5
  97. mteb/models/model_implementations/amazon_models.py +1 -0
  98. mteb/models/model_implementations/andersborges.py +6 -4
  99. mteb/models/model_implementations/ara_models.py +2 -1
  100. mteb/models/model_implementations/arctic_models.py +16 -8
  101. mteb/models/model_implementations/b1ade_models.py +2 -1
  102. mteb/models/model_implementations/bedrock_models.py +20 -6
  103. mteb/models/model_implementations/bge_models.py +85 -22
  104. mteb/models/model_implementations/bica_model.py +4 -3
  105. mteb/models/model_implementations/blip2_models.py +13 -6
  106. mteb/models/model_implementations/blip_models.py +33 -20
  107. mteb/models/model_implementations/bm25.py +27 -17
  108. mteb/models/model_implementations/bmretriever_models.py +16 -6
  109. mteb/models/model_implementations/cadet_models.py +2 -1
  110. mteb/models/model_implementations/cde_models.py +22 -9
  111. mteb/models/model_implementations/clip_models.py +18 -10
  112. mteb/models/model_implementations/clips_models.py +6 -3
  113. mteb/models/model_implementations/codefuse_models.py +10 -5
  114. mteb/models/model_implementations/codesage_models.py +6 -3
  115. mteb/models/model_implementations/cohere_models.py +19 -9
  116. mteb/models/model_implementations/cohere_v.py +16 -6
  117. mteb/models/model_implementations/colpali_models.py +10 -6
  118. mteb/models/model_implementations/colqwen_models.py +24 -38
  119. mteb/models/model_implementations/colsmol_models.py +5 -3
  120. mteb/models/model_implementations/conan_models.py +12 -5
  121. mteb/models/model_implementations/dino_models.py +70 -46
  122. mteb/models/model_implementations/e5_instruct.py +27 -4
  123. mteb/models/model_implementations/e5_models.py +18 -9
  124. mteb/models/model_implementations/e5_v.py +16 -10
  125. mteb/models/model_implementations/eagerworks_models.py +12 -5
  126. mteb/models/model_implementations/emillykkejensen_models.py +9 -6
  127. mteb/models/model_implementations/en_code_retriever.py +2 -1
  128. mteb/models/model_implementations/euler_models.py +3 -2
  129. mteb/models/model_implementations/evaclip_models.py +13 -4
  130. mteb/models/model_implementations/fa_models.py +18 -9
  131. mteb/models/model_implementations/facebookai.py +16 -2
  132. mteb/models/model_implementations/geogpt_models.py +2 -1
  133. mteb/models/model_implementations/gme_v_models.py +13 -8
  134. mteb/models/model_implementations/google_models.py +16 -5
  135. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -6
  136. mteb/models/model_implementations/gritlm_models.py +5 -2
  137. mteb/models/model_implementations/gte_models.py +34 -13
  138. mteb/models/model_implementations/hinvec_models.py +7 -2
  139. mteb/models/model_implementations/human.py +1 -0
  140. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  141. mteb/models/model_implementations/inf_models.py +4 -2
  142. mteb/models/model_implementations/jasper_models.py +16 -7
  143. mteb/models/model_implementations/jina_clip.py +58 -14
  144. mteb/models/model_implementations/jina_models.py +35 -16
  145. mteb/models/model_implementations/kalm_models.py +24 -12
  146. mteb/models/model_implementations/kblab.py +13 -6
  147. mteb/models/model_implementations/kennethenevoldsen_models.py +6 -4
  148. mteb/models/model_implementations/kfst.py +2 -1
  149. mteb/models/model_implementations/kowshik24_models.py +2 -1
  150. mteb/models/model_implementations/lens_models.py +2 -0
  151. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  152. mteb/models/model_implementations/linq_models.py +8 -2
  153. mteb/models/model_implementations/listconranker.py +11 -5
  154. mteb/models/model_implementations/llm2clip_models.py +18 -10
  155. mteb/models/model_implementations/llm2vec_models.py +28 -14
  156. mteb/models/model_implementations/mcinext_models.py +12 -3
  157. mteb/models/model_implementations/mdbr_models.py +19 -3
  158. mteb/models/model_implementations/misc_models.py +131 -68
  159. mteb/models/model_implementations/mixedbread_ai_models.py +335 -0
  160. mteb/models/model_implementations/mme5_models.py +3 -2
  161. mteb/models/model_implementations/moco_models.py +15 -8
  162. mteb/models/model_implementations/mod_models.py +3 -2
  163. mteb/models/model_implementations/model2vec_models.py +37 -18
  164. mteb/models/model_implementations/moka_models.py +4 -1
  165. mteb/models/model_implementations/nbailab.py +6 -3
  166. mteb/models/model_implementations/no_instruct_sentence_models.py +15 -7
  167. mteb/models/model_implementations/nomic_models.py +47 -19
  168. mteb/models/model_implementations/nomic_models_vision.py +6 -4
  169. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +20 -8
  170. mteb/models/model_implementations/nvidia_models.py +165 -22
  171. mteb/models/model_implementations/octen_models.py +64 -3
  172. mteb/models/model_implementations/openai_models.py +14 -4
  173. mteb/models/model_implementations/openclip_models.py +30 -17
  174. mteb/models/model_implementations/opensearch_neural_sparse_models.py +20 -9
  175. mteb/models/model_implementations/ops_moa_models.py +10 -3
  176. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +2 -1
  177. mteb/models/model_implementations/pawan_models.py +2 -1
  178. mteb/models/model_implementations/piccolo_models.py +3 -1
  179. mteb/models/model_implementations/pixie_models.py +56 -0
  180. mteb/models/model_implementations/promptriever_models.py +20 -10
  181. mteb/models/model_implementations/pylate_models.py +41 -21
  182. mteb/models/model_implementations/qodo_models.py +4 -2
  183. mteb/models/model_implementations/qtack_models.py +2 -1
  184. mteb/models/model_implementations/qwen3_models.py +14 -4
  185. mteb/models/model_implementations/qzhou_models.py +4 -2
  186. mteb/models/model_implementations/random_baseline.py +7 -6
  187. mteb/models/model_implementations/rasgaard_models.py +3 -2
  188. mteb/models/model_implementations/reasonir_model.py +66 -1
  189. mteb/models/model_implementations/repllama_models.py +18 -9
  190. mteb/models/model_implementations/rerankers_custom.py +25 -10
  191. mteb/models/model_implementations/rerankers_monot5_based.py +41 -21
  192. mteb/models/model_implementations/richinfoai_models.py +2 -1
  193. mteb/models/model_implementations/ru_sentence_models.py +40 -20
  194. mteb/models/model_implementations/ruri_models.py +20 -10
  195. mteb/models/model_implementations/salesforce_models.py +13 -4
  196. mteb/models/model_implementations/samilpwc_models.py +2 -1
  197. mteb/models/model_implementations/sarashina_embedding_models.py +4 -2
  198. mteb/models/model_implementations/searchmap_models.py +2 -1
  199. mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
  200. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +119 -148
  201. mteb/models/model_implementations/seed_models.py +2 -1
  202. mteb/models/model_implementations/sentence_transformers_models.py +142 -22
  203. mteb/models/model_implementations/shuu_model.py +2 -1
  204. mteb/models/model_implementations/siglip_models.py +39 -24
  205. mteb/models/model_implementations/slm_models.py +419 -0
  206. mteb/models/model_implementations/sonar_models.py +2 -1
  207. mteb/models/model_implementations/spartan8806_atles_champion.py +2 -1
  208. mteb/models/model_implementations/stella_models.py +23 -4
  209. mteb/models/model_implementations/tarka_models.py +4 -2
  210. mteb/models/model_implementations/text2vec_models.py +12 -3
  211. mteb/models/model_implementations/ua_sentence_models.py +2 -1
  212. mteb/models/model_implementations/uae_models.py +17 -5
  213. mteb/models/model_implementations/vdr_models.py +9 -2
  214. mteb/models/model_implementations/vi_vn_models.py +12 -6
  215. mteb/models/model_implementations/vista_models.py +11 -4
  216. mteb/models/model_implementations/vlm2vec_models.py +14 -7
  217. mteb/models/model_implementations/voyage_models.py +136 -4
  218. mteb/models/model_implementations/voyage_v.py +17 -10
  219. mteb/models/model_implementations/xyz_models.py +1 -0
  220. mteb/models/model_implementations/youtu_models.py +2 -1
  221. mteb/models/model_implementations/yuan_models.py +2 -1
  222. mteb/models/model_implementations/yuan_models_en.py +3 -2
  223. mteb/models/model_meta.py +127 -40
  224. mteb/models/models_protocols.py +43 -22
  225. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  226. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +21 -10
  227. mteb/models/search_wrappers.py +63 -29
  228. mteb/models/sentence_transformer_wrapper.py +52 -26
  229. mteb/models/vllm_wrapper.py +329 -0
  230. mteb/py.typed +0 -0
  231. mteb/results/benchmark_results.py +48 -35
  232. mteb/results/model_result.py +68 -32
  233. mteb/results/task_result.py +110 -72
  234. mteb/similarity_functions.py +19 -9
  235. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  236. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  237. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  238. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  239. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  240. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  241. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  242. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  243. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  244. mteb/tasks/bitext_mining/eng/pub_chem_smiles_bitext_mining.py +1 -1
  245. mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py +3 -3
  246. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  247. mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py +1 -1
  248. mteb/tasks/bitext_mining/multilingual/in22_conv_bitext_mining.py +1 -1
  249. mteb/tasks/bitext_mining/multilingual/in22_gen_bitext_mining.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/norwegian_courts_bitext_mining.py +1 -1
  251. mteb/tasks/bitext_mining/multilingual/ntrex_bitext_mining.py +1 -1
  252. mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py +2 -2
  253. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -1
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -1
  256. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -1
  257. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -1
  258. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -1
  259. mteb/tasks/classification/bul/bulgarian_store_review_sentiment_classfication.py +1 -1
  260. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +2 -2
  261. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -1
  262. mteb/tasks/classification/dan/dk_hate_classification.py +2 -2
  263. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -1
  264. mteb/tasks/classification/ell/greek_legal_code_classification.py +1 -1
  265. mteb/tasks/classification/eng/dbpedia_classification.py +2 -2
  266. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -2
  267. mteb/tasks/classification/eng/toxic_conversations_classification.py +2 -2
  268. mteb/tasks/classification/eng/tweet_topic_single_classification.py +1 -1
  269. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -1
  270. mteb/tasks/classification/eng/yelp_review_full_classification.py +2 -2
  271. mteb/tasks/classification/est/estonian_valence.py +2 -2
  272. mteb/tasks/classification/fas/fa_mteb_classification.py +6 -6
  273. mteb/tasks/classification/fas/persian_food_sentiment_classification.py +1 -1
  274. mteb/tasks/classification/fil/filipino_shopee_reviews_classification.py +1 -1
  275. mteb/tasks/classification/fin/fin_toxicity_classification.py +1 -1
  276. mteb/tasks/classification/fra/french_book_reviews.py +2 -2
  277. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +2 -2
  278. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -1
  279. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -1
  280. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -1
  281. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +2 -2
  282. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -1
  283. mteb/tasks/classification/ita/dado_eval_coarse_classification.py +1 -1
  284. mteb/tasks/classification/ita/ita_casehold_classification.py +1 -1
  285. mteb/tasks/classification/ita/sardi_stance_classification.py +1 -1
  286. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -1
  287. mteb/tasks/classification/jpn/wrime_classification.py +1 -1
  288. mteb/tasks/classification/kan/kannada_news_classification.py +2 -2
  289. mteb/tasks/classification/kor/klue_tc.py +2 -2
  290. mteb/tasks/classification/kor/kor_fin.py +1 -1
  291. mteb/tasks/classification/kor/kor_hate_classification.py +1 -1
  292. mteb/tasks/classification/kor/kor_sarcasm_classification.py +1 -1
  293. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  294. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -1
  295. mteb/tasks/classification/mar/marathi_news_classification.py +1 -1
  296. mteb/tasks/classification/multilingual/afri_senti_lang_classification.py +1 -1
  297. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -1
  298. mteb/tasks/classification/multilingual/cyrillic_turkic_lang_classification.py +1 -1
  299. mteb/tasks/classification/multilingual/indic_nlp_news_classification.py +1 -1
  300. mteb/tasks/classification/multilingual/masakha_news_classification.py +1 -1
  301. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -1
  302. mteb/tasks/classification/multilingual/multilingual_sentiment_classification.py +1 -1
  303. mteb/tasks/classification/multilingual/scala_classification.py +2 -2
  304. mteb/tasks/classification/multilingual/sib200_classification.py +1 -1
  305. mteb/tasks/classification/multilingual/turkic_classification.py +1 -1
  306. mteb/tasks/classification/multilingual/tweet_sentiment_classification.py +1 -1
  307. mteb/tasks/classification/nep/nepali_news_classification.py +2 -2
  308. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +1 -1
  309. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +1 -1
  310. mteb/tasks/classification/ory/odia_news_classification.py +2 -2
  311. mteb/tasks/classification/pan/punjabi_news_classification.py +1 -1
  312. mteb/tasks/classification/ron/moroco.py +1 -1
  313. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -1
  314. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -1
  315. mteb/tasks/classification/rus/georeview_classification.py +1 -1
  316. mteb/tasks/classification/rus/headline_classification.py +2 -2
  317. mteb/tasks/classification/rus/inappropriateness_classification.py +2 -2
  318. mteb/tasks/classification/rus/ru_reviews_classification.py +2 -2
  319. mteb/tasks/classification/rus/ru_sci_bench_grnti_classification.py +1 -1
  320. mteb/tasks/classification/rus/ru_sci_bench_oecd_classification.py +1 -1
  321. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -1
  322. mteb/tasks/classification/san/sanskrit_shlokas_classification.py +1 -1
  323. mteb/tasks/classification/sin/sinhala_news_classification.py +2 -2
  324. mteb/tasks/classification/sin/sinhala_news_source_classification.py +2 -2
  325. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +2 -2
  326. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -1
  327. mteb/tasks/classification/spa/spanish_news_classification.py +2 -2
  328. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -1
  329. mteb/tasks/classification/tam/tamil_news_classification.py +2 -2
  330. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +2 -2
  331. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  332. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +2 -2
  333. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  334. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -1
  335. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -1
  336. mteb/tasks/classification/zho/yue_openrice_review_classification.py +2 -2
  337. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -1
  338. mteb/tasks/clustering/deu/blurbs_clustering_p2p.py +1 -1
  339. mteb/tasks/clustering/deu/blurbs_clustering_s2s.py +1 -1
  340. mteb/tasks/clustering/eng/arxiv_clustering_p2p.py +1 -1
  341. mteb/tasks/clustering/eng/arxiv_hierarchical_clustering.py +2 -2
  342. mteb/tasks/clustering/eng/big_patent_clustering.py +1 -1
  343. mteb/tasks/clustering/eng/biorxiv_clustering_p2p.py +1 -1
  344. mteb/tasks/clustering/eng/biorxiv_clustering_s2s.py +1 -1
  345. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  346. mteb/tasks/clustering/eng/medrxiv_clustering_p2p.py +1 -1
  347. mteb/tasks/clustering/eng/medrxiv_clustering_s2s.py +1 -1
  348. mteb/tasks/clustering/eng/reddit_clustering.py +1 -1
  349. mteb/tasks/clustering/eng/reddit_clustering_p2p.py +1 -1
  350. mteb/tasks/clustering/eng/stack_exchange_clustering.py +1 -1
  351. mteb/tasks/clustering/eng/stack_exchange_clustering_p2p.py +1 -1
  352. mteb/tasks/clustering/eng/twenty_newsgroups_clustering.py +1 -1
  353. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  354. mteb/tasks/clustering/fas/fa_mteb_clustering.py +4 -4
  355. mteb/tasks/clustering/fra/hal_clustering_s2s.py +2 -2
  356. mteb/tasks/clustering/multilingual/mlsum_clustering_p2p.py +2 -2
  357. mteb/tasks/clustering/multilingual/mlsum_clustering_s2s.py +2 -2
  358. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -1
  359. mteb/tasks/clustering/multilingual/wiki_clustering_p2p.py +1 -1
  360. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +1 -1
  361. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +1 -1
  362. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +1 -1
  363. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +1 -1
  364. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +1 -1
  365. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +1 -1
  366. mteb/tasks/clustering/nob/snl_clustering.py +8 -3
  367. mteb/tasks/clustering/nob/vg_clustering.py +8 -3
  368. mteb/tasks/clustering/pol/polish_clustering.py +3 -3
  369. mteb/tasks/clustering/rus/ru_sci_bench_grnti_clustering_p2p.py +1 -1
  370. mteb/tasks/clustering/rus/ru_sci_bench_oecd_clustering_p2p.py +1 -1
  371. mteb/tasks/clustering/zho/cmteb_clustering.py +6 -6
  372. mteb/tasks/image_text_pair_classification/eng/image_co_de.py +1 -1
  373. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +2 -2
  374. mteb/tasks/instruction_reranking/multilingual/m_follow_ir.py +2 -2
  375. mteb/tasks/multichoice/eng/cv_bench.py +4 -4
  376. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -1
  377. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -1
  378. mteb/tasks/multilabel_classification/rus/ru_toixic_multilabelclassification_okmlcup.py +1 -1
  379. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  380. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -1
  381. mteb/tasks/pair_classification/ara/ar_entail.py +1 -1
  382. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -1
  383. mteb/tasks/pair_classification/deu/false_friends_de_en_pc.py +1 -1
  384. mteb/tasks/pair_classification/eng/pub_chem_ai_sentence_paraphrase_pc.py +1 -1
  385. mteb/tasks/pair_classification/eng/pub_chem_smilespc.py +4 -3
  386. mteb/tasks/pair_classification/eng/pub_chem_synonym_pc.py +1 -1
  387. mteb/tasks/pair_classification/eng/pub_chem_wiki_paragraphs_pc.py +1 -1
  388. mteb/tasks/pair_classification/eng/sprint_duplicate_questions_pc.py +1 -1
  389. mteb/tasks/pair_classification/eng/twitter_sem_eval2015_pc.py +1 -1
  390. mteb/tasks/pair_classification/eng/twitter_url_corpus_pc.py +1 -1
  391. mteb/tasks/pair_classification/fas/fa_mteb_pair_classification.py +5 -5
  392. mteb/tasks/pair_classification/fas/fars_tail.py +2 -2
  393. mteb/tasks/pair_classification/hye/armenian_paraphrase_pc.py +1 -1
  394. mteb/tasks/pair_classification/ita/dis_co_tex_pair_classification.py +1 -1
  395. mteb/tasks/pair_classification/kor/klue_nli.py +1 -1
  396. mteb/tasks/pair_classification/multilingual/rte3.py +2 -2
  397. mteb/tasks/pair_classification/multilingual/xnli.py +1 -1
  398. mteb/tasks/pair_classification/pol/polish_pc.py +4 -4
  399. mteb/tasks/pair_classification/por/assin2_rte.py +1 -1
  400. mteb/tasks/pair_classification/por/sick_br_pc.py +1 -1
  401. mteb/tasks/pair_classification/rus/terra.py +2 -2
  402. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -1
  403. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -1
  404. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -1
  405. mteb/tasks/pair_classification/zho/cmteb_pair_classification.py +2 -2
  406. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  407. mteb/tasks/retrieval/ara/sadeem_question_retrieval.py +1 -1
  408. mteb/tasks/retrieval/code/code_edit_search_retrieval.py +1 -1
  409. mteb/tasks/retrieval/code/code_rag.py +16 -16
  410. mteb/tasks/retrieval/code/code_search_net_cc_retrieval.py +1 -1
  411. mteb/tasks/retrieval/code/coir_code_search_net_retrieval.py +1 -1
  412. mteb/tasks/retrieval/code/ds1000_retrieval.py +1 -1
  413. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +1 -1
  414. mteb/tasks/retrieval/code/human_eval_retrieval.py +1 -1
  415. mteb/tasks/retrieval/code/mbpp_retrieval.py +1 -1
  416. mteb/tasks/retrieval/code/wiki_sql_retrieval.py +1 -1
  417. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +2 -2
  418. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +3 -3
  419. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +3 -3
  420. mteb/tasks/retrieval/deu/german_gov_service_retrieval.py +1 -1
  421. mteb/tasks/retrieval/deu/german_qu_ad_retrieval.py +1 -1
  422. mteb/tasks/retrieval/ell/greek_civics_qa.py +1 -1
  423. mteb/tasks/retrieval/eng/__init__.py +44 -0
  424. mteb/tasks/retrieval/eng/bright_retrieval.py +10 -2
  425. mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
  426. mteb/tasks/retrieval/eng/chat_doctor_retrieval.py +1 -1
  427. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  428. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  429. mteb/tasks/retrieval/eng/fin_qa_retrieval.py +1 -1
  430. mteb/tasks/retrieval/eng/finance_bench_retrieval.py +1 -1
  431. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +1 -1
  432. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +1 -1
  433. mteb/tasks/retrieval/eng/hc3_finance_retrieval.py +1 -1
  434. mteb/tasks/retrieval/eng/lemb_narrative_qa_retrieval.py +1 -1
  435. mteb/tasks/retrieval/eng/lemb_needle_retrieval.py +1 -1
  436. mteb/tasks/retrieval/eng/lemb_passkey_retrieval.py +1 -1
  437. mteb/tasks/retrieval/eng/lemb_summ_screen_fd_retrieval.py +1 -1
  438. mteb/tasks/retrieval/eng/lemb_wikim_qa_retrieval.py +1 -1
  439. mteb/tasks/retrieval/eng/lembqm_sum_retrieval.py +1 -1
  440. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  441. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -1
  442. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +1 -1
  443. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/ml_questions.py +1 -1
  445. mteb/tasks/retrieval/eng/nano_argu_ana_retrieval.py +1 -1
  446. mteb/tasks/retrieval/eng/nano_climate_fever_retrieval.py +1 -1
  447. mteb/tasks/retrieval/eng/nano_db_pedia_retrieval.py +1 -1
  448. mteb/tasks/retrieval/eng/nano_fever_retrieval.py +1 -1
  449. mteb/tasks/retrieval/eng/nano_fi_qa2018_retrieval.py +1 -1
  450. mteb/tasks/retrieval/eng/nano_hotpot_qa_retrieval.py +1 -1
  451. mteb/tasks/retrieval/eng/nano_msmarco_retrieval.py +1 -1
  452. mteb/tasks/retrieval/eng/nano_nf_corpus_retrieval.py +1 -1
  453. mteb/tasks/retrieval/eng/nano_nq_retrieval.py +1 -1
  454. mteb/tasks/retrieval/eng/nano_quora_retrieval.py +1 -1
  455. mteb/tasks/retrieval/eng/nano_sci_fact_retrieval.py +1 -1
  456. mteb/tasks/retrieval/eng/nano_scidocs_retrieval.py +1 -1
  457. mteb/tasks/retrieval/eng/nano_touche2020_retrieval.py +1 -1
  458. mteb/tasks/retrieval/eng/narrative_qa_retrieval.py +1 -1
  459. mteb/tasks/retrieval/eng/r2_med_retrieval.py +8 -8
  460. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +1 -1
  461. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +1 -1
  462. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +10 -10
  463. mteb/tasks/retrieval/fra/f_qu_ad_retrieval.py +1 -1
  464. mteb/tasks/retrieval/fra/syntec_retrieval.py +1 -1
  465. mteb/tasks/retrieval/hun/hun_sum2.py +1 -1
  466. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +1 -1
  467. mteb/tasks/retrieval/kor/__init__.py +15 -1
  468. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  469. mteb/tasks/retrieval/multilingual/__init__.py +2 -0
  470. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt19.py +1 -1
  471. mteb/tasks/retrieval/multilingual/cross_lingual_semantic_discrimination_wmt21.py +1 -1
  472. mteb/tasks/retrieval/multilingual/cur_ev1_retrieval.py +1 -1
  473. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  474. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +1 -1
  475. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +1 -1
  476. mteb/tasks/retrieval/multilingual/mr_tidy_retrieval.py +1 -1
  477. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +1 -1
  478. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +5 -5
  479. mteb/tasks/retrieval/multilingual/statcan_dialogue_dataset_retrieval.py +1 -1
  480. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  481. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +14 -4
  482. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
  483. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +1 -1
  484. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +1 -1
  485. mteb/tasks/retrieval/multilingual/x_qu_ad_retrieval.py +1 -1
  486. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +1 -1
  487. mteb/tasks/retrieval/nld/cqa_dupstack_android_nl_retrieval.py +1 -1
  488. mteb/tasks/retrieval/nld/cqa_dupstack_english_nl_retrieval.py +1 -1
  489. mteb/tasks/retrieval/nld/cqa_dupstack_gaming_nl_retrieval.py +1 -1
  490. mteb/tasks/retrieval/nld/cqa_dupstack_gis_nl_retrieval.py +1 -1
  491. mteb/tasks/retrieval/nld/cqa_dupstack_mathematica_nl_retrieval.py +1 -1
  492. mteb/tasks/retrieval/nld/cqa_dupstack_physics_nl_retrieval.py +1 -1
  493. mteb/tasks/retrieval/nld/cqa_dupstack_programmers_nl_retrieval.py +1 -1
  494. mteb/tasks/retrieval/nld/cqa_dupstack_stats_nl_retrieval.py +1 -1
  495. mteb/tasks/retrieval/nld/cqa_dupstack_tex_nl_retrieval.py +1 -1
  496. mteb/tasks/retrieval/nld/cqa_dupstack_unix_nl_retrieval.py +1 -1
  497. mteb/tasks/retrieval/nld/cqa_dupstack_webmasters_nl_retrieval.py +1 -1
  498. mteb/tasks/retrieval/nld/cqa_dupstack_wordpress_nl_retrieval.py +1 -1
  499. mteb/tasks/retrieval/nob/norquad.py +3 -3
  500. mteb/tasks/retrieval/nob/snl_retrieval.py +3 -3
  501. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -1
  502. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  503. mteb/tasks/retrieval/vie/__init__.py +14 -6
  504. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  505. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  506. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  507. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  508. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  509. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  510. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  511. mteb/tasks/retrieval/vie/vie_qu_ad_retrieval.py +1 -1
  512. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  513. mteb/tasks/sts/fao/faroese_sts.py +1 -1
  514. mteb/tasks/sts/fra/sick_fr_sts.py +1 -1
  515. mteb/tasks/sts/kor/klue_sts.py +1 -1
  516. mteb/tasks/sts/por/sick_br_sts.py +1 -1
  517. mteb/tasks/sts/rus/ru_para_phraser_sts.py +1 -1
  518. mteb/tasks/zeroshot_classification/eng/sci_mmir.py +1 -1
  519. mteb/types/__init__.py +2 -0
  520. mteb/types/_encoder_io.py +13 -1
  521. mteb/types/_result.py +2 -1
  522. mteb/types/statistics.py +18 -5
  523. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/METADATA +15 -4
  524. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/RECORD +528 -486
  525. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/WHEEL +1 -1
  526. mteb/models/model_implementations/mxbai_models.py +0 -111
  527. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/entry_points.txt +0 -0
  528. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/licenses/LICENSE +0 -0
  529. {mteb-2.5.2.dist-info → mteb-2.7.9.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,7 @@ class Vidore3FinanceEnRetrieval(AbsTaskRetrieval):
15
15
  metadata = TaskMetadata(
16
16
  name="Vidore3FinanceEnRetrieval",
17
17
  description="Retrieve associated pages according to questions. This task, Finance - EN, is a corpus of reports from american banking companies, intended for long-document understanding tasks. Original queries were created in english, then translated to french, german, italian, portuguese and spanish.",
18
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
18
+ reference="https://arxiv.org/abs/2601.08620",
19
19
  dataset={
20
20
  "path": "vidore/vidore_v3_finance_en_mteb_format",
21
21
  "revision": "fa78cb14152b3dde8c5defdc4e3ddf50de69dfeb",
@@ -34,15 +34,14 @@ class Vidore3FinanceEnRetrieval(AbsTaskRetrieval):
34
34
  modalities=["text", "image"],
35
35
  sample_creation="created and machine-translated",
36
36
  bibtex_citation=r"""
37
- @misc{mace2025vidorev3,
38
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
39
- day = {5},
40
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
41
- journal = {Hugging Face Blog},
42
- month = {November},
43
- publisher = {Hugging Face},
44
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
45
- year = {2025},
37
+ @article{loison2026vidorev3comprehensiveevaluation,
38
+ archiveprefix = {arXiv},
39
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
40
+ eprint = {2601.08620},
41
+ primaryclass = {cs.AI},
42
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
43
+ url = {https://arxiv.org/abs/2601.08620},
44
+ year = {2026},
46
45
  }
47
46
  """,
48
47
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -53,7 +52,7 @@ class Vidore3FinanceFrRetrieval(AbsTaskRetrieval):
53
52
  metadata = TaskMetadata(
54
53
  name="Vidore3FinanceFrRetrieval",
55
54
  description="Retrieve associated pages according to questions. This task, Finance - FR, is a corpus of reports from french companies in the luxury domain, intended for long-document understanding tasks. Original queries were created in french, then translated to english, german, italian, portuguese and spanish.",
56
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
55
+ reference="https://arxiv.org/abs/2601.08620",
57
56
  dataset={
58
57
  "path": "vidore/vidore_v3_finance_fr_mteb_format",
59
58
  "revision": "8a2adfda85a7967c7252129703d9b3c7c9f038a9",
@@ -71,15 +70,14 @@ class Vidore3FinanceFrRetrieval(AbsTaskRetrieval):
71
70
  dialect=[],
72
71
  sample_creation="created and machine-translated",
73
72
  bibtex_citation=r"""
74
- @misc{mace2025vidorev3,
75
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
76
- day = {5},
77
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
78
- journal = {Hugging Face Blog},
79
- month = {November},
80
- publisher = {Hugging Face},
81
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
82
- year = {2025},
73
+ @article{loison2026vidorev3comprehensiveevaluation,
74
+ archiveprefix = {arXiv},
75
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
76
+ eprint = {2601.08620},
77
+ primaryclass = {cs.AI},
78
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
79
+ url = {https://arxiv.org/abs/2601.08620},
80
+ year = {2026},
83
81
  }
84
82
  """,
85
83
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -91,7 +89,7 @@ class Vidore3IndustrialRetrieval(AbsTaskRetrieval):
91
89
  metadata = TaskMetadata(
92
90
  name="Vidore3IndustrialRetrieval",
93
91
  description="Retrieve associated pages according to questions. This dataset, Industrial reports, is a corpus of technical documents on military aircraft (fueling, mechanics...), intended for complex-document understanding tasks. Original queries were created in english, then translated to french, german, italian, portuguese and spanish.",
94
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
92
+ reference="https://arxiv.org/abs/2601.08620",
95
93
  dataset={
96
94
  "path": "vidore/vidore_v3_industrial_mteb_format",
97
95
  "revision": "f732b725cf4a70803210edfe265a04f8bd5328f6",
@@ -110,15 +108,14 @@ class Vidore3IndustrialRetrieval(AbsTaskRetrieval):
110
108
  modalities=["text", "image"],
111
109
  sample_creation="created and machine-translated",
112
110
  bibtex_citation=r"""
113
- @misc{mace2025vidorev3,
114
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
115
- day = {5},
116
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
117
- journal = {Hugging Face Blog},
118
- month = {November},
119
- publisher = {Hugging Face},
120
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
121
- year = {2025},
111
+ @article{loison2026vidorev3comprehensiveevaluation,
112
+ archiveprefix = {arXiv},
113
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
114
+ eprint = {2601.08620},
115
+ primaryclass = {cs.AI},
116
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
117
+ url = {https://arxiv.org/abs/2601.08620},
118
+ year = {2026},
122
119
  }
123
120
  """,
124
121
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -130,7 +127,7 @@ class Vidore3PharmaceuticalsRetrieval(AbsTaskRetrieval):
130
127
  metadata = TaskMetadata(
131
128
  name="Vidore3PharmaceuticalsRetrieval",
132
129
  description="Retrieve associated pages according to questions. This dataset, Pharmaceutical, is a corpus of slides from the FDA, intended for long-document understanding tasks. Original queries were created in english, then translated to french, german, italian, portuguese and spanish.",
133
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
130
+ reference="https://arxiv.org/abs/2601.08620",
134
131
  dataset={
135
132
  "path": "vidore/vidore_v3_pharmaceuticals_mteb_format",
136
133
  "revision": "237ed4f43c7fb3c4df07ec4e9dd0a4366be555b0",
@@ -149,15 +146,14 @@ class Vidore3PharmaceuticalsRetrieval(AbsTaskRetrieval):
149
146
  modalities=["text", "image"],
150
147
  sample_creation="created and machine-translated",
151
148
  bibtex_citation=r"""
152
- @misc{mace2025vidorev3,
153
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
154
- day = {5},
155
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
156
- journal = {Hugging Face Blog},
157
- month = {November},
158
- publisher = {Hugging Face},
159
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
160
- year = {2025},
149
+ @article{loison2026vidorev3comprehensiveevaluation,
150
+ archiveprefix = {arXiv},
151
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
152
+ eprint = {2601.08620},
153
+ primaryclass = {cs.AI},
154
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
155
+ url = {https://arxiv.org/abs/2601.08620},
156
+ year = {2026},
161
157
  }
162
158
  """,
163
159
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -169,7 +165,7 @@ class Vidore3ComputerScienceRetrieval(AbsTaskRetrieval):
169
165
  metadata = TaskMetadata(
170
166
  name="Vidore3ComputerScienceRetrieval",
171
167
  description="Retrieve associated pages according to questions. This dataset, Computer Science, is a corpus of textbooks from the openstacks website, intended for long-document understanding tasks. Original queries were created in english, then translated to french, german, italian, portuguese and spanish.",
172
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
168
+ reference="https://arxiv.org/abs/2601.08620",
173
169
  dataset={
174
170
  "path": "vidore/vidore_v3_computer_science_mteb_format",
175
171
  "revision": "fb7fb69f81f7db62790f40494124b8ad22b424ab",
@@ -188,15 +184,14 @@ class Vidore3ComputerScienceRetrieval(AbsTaskRetrieval):
188
184
  modalities=["text", "image"],
189
185
  sample_creation="created and machine-translated",
190
186
  bibtex_citation=r"""
191
- @misc{mace2025vidorev3,
192
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
193
- day = {5},
194
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
195
- journal = {Hugging Face Blog},
196
- month = {November},
197
- publisher = {Hugging Face},
198
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
199
- year = {2025},
187
+ @article{loison2026vidorev3comprehensiveevaluation,
188
+ archiveprefix = {arXiv},
189
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
190
+ eprint = {2601.08620},
191
+ primaryclass = {cs.AI},
192
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
193
+ url = {https://arxiv.org/abs/2601.08620},
194
+ year = {2026},
200
195
  }
201
196
  """,
202
197
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -208,7 +203,7 @@ class Vidore3HrRetrieval(AbsTaskRetrieval):
208
203
  metadata = TaskMetadata(
209
204
  name="Vidore3HrRetrieval",
210
205
  description="Retrieve associated pages according to questions. This dataset, HR, is a corpus of reports released by the european union, intended for complex-document understanding tasks. Original queries were created in english, then translated to french, german, italian, portuguese and spanish.",
211
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
206
+ reference="https://arxiv.org/abs/2601.08620",
212
207
  dataset={
213
208
  "path": "vidore/vidore_v3_hr_mteb_format",
214
209
  "revision": "bc7d43d64815ed30f664168c8052106484aba7fd",
@@ -227,15 +222,14 @@ class Vidore3HrRetrieval(AbsTaskRetrieval):
227
222
  modalities=["text", "image"],
228
223
  sample_creation="created and machine-translated",
229
224
  bibtex_citation=r"""
230
- @misc{mace2025vidorev3,
231
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
232
- day = {5},
233
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
234
- journal = {Hugging Face Blog},
235
- month = {November},
236
- publisher = {Hugging Face},
237
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
238
- year = {2025},
225
+ @article{loison2026vidorev3comprehensiveevaluation,
226
+ archiveprefix = {arXiv},
227
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
228
+ eprint = {2601.08620},
229
+ primaryclass = {cs.AI},
230
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
231
+ url = {https://arxiv.org/abs/2601.08620},
232
+ year = {2026},
239
233
  }
240
234
  """,
241
235
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -247,7 +241,7 @@ class Vidore3EnergyRetrieval(AbsTaskRetrieval):
247
241
  metadata = TaskMetadata(
248
242
  name="Vidore3EnergyRetrieval",
249
243
  description="Retrieve associated pages according to questions. This dataset, Energy Fr, is a corpus of reports on energy supply in europe, intended for complex-document understanding tasks. Original queries were created in french, then translated to english, german, italian, portuguese and spanish.",
250
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
244
+ reference="https://arxiv.org/abs/2601.08620",
251
245
  dataset={
252
246
  "path": "vidore/vidore_v3_energy_mteb_format",
253
247
  "revision": "84fca99e5978604bae30f2436eacb6dbaa0532e9",
@@ -266,15 +260,14 @@ class Vidore3EnergyRetrieval(AbsTaskRetrieval):
266
260
  modalities=["text", "image"],
267
261
  sample_creation="created and machine-translated",
268
262
  bibtex_citation=r"""
269
- @misc{mace2025vidorev3,
270
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
271
- day = {5},
272
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
273
- journal = {Hugging Face Blog},
274
- month = {November},
275
- publisher = {Hugging Face},
276
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
277
- year = {2025},
263
+ @article{loison2026vidorev3comprehensiveevaluation,
264
+ archiveprefix = {arXiv},
265
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
266
+ eprint = {2601.08620},
267
+ primaryclass = {cs.AI},
268
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
269
+ url = {https://arxiv.org/abs/2601.08620},
270
+ year = {2026},
278
271
  }
279
272
  """,
280
273
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -286,7 +279,7 @@ class Vidore3PhysicsRetrieval(AbsTaskRetrieval):
286
279
  metadata = TaskMetadata(
287
280
  name="Vidore3PhysicsRetrieval",
288
281
  description="Retrieve associated pages according to questions. This dataset, Physics, is a corpus of course slides on french bachelor level physics lectures, intended for complex visual understanding tasks. Original queries were created in french, then translated to english, german, italian, portuguese and spanish.",
289
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
282
+ reference="https://arxiv.org/abs/2601.08620",
290
283
  dataset={
291
284
  "path": "vidore/vidore_v3_physics_mteb_format",
292
285
  "revision": "2c18ef90ab3ef93a9d86ecc6521cdae2a29f8300",
@@ -305,15 +298,14 @@ class Vidore3PhysicsRetrieval(AbsTaskRetrieval):
305
298
  modalities=["text", "image"],
306
299
  sample_creation="created and machine-translated",
307
300
  bibtex_citation=r"""
308
- @misc{mace2025vidorev3,
309
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
310
- day = {5},
311
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
312
- journal = {Hugging Face Blog},
313
- month = {November},
314
- publisher = {Hugging Face},
315
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
316
- year = {2025},
301
+ @article{loison2026vidorev3comprehensiveevaluation,
302
+ archiveprefix = {arXiv},
303
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
304
+ eprint = {2601.08620},
305
+ primaryclass = {cs.AI},
306
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
307
+ url = {https://arxiv.org/abs/2601.08620},
308
+ year = {2026},
317
309
  }
318
310
  """,
319
311
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -325,7 +317,7 @@ class Vidore3NuclearRetrieval(AbsTaskRetrieval):
325
317
  metadata = TaskMetadata(
326
318
  name="Vidore3NuclearRetrieval",
327
319
  description="Retrieve associated pages according to questions.",
328
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
320
+ reference="https://arxiv.org/abs/2601.08620",
329
321
  dataset={
330
322
  "path": "mteb-private/Vidore3NuclearRetrieval",
331
323
  "revision": "a463fc67fefc01152153101e88a32d5f9515e3e3",
@@ -344,15 +336,14 @@ class Vidore3NuclearRetrieval(AbsTaskRetrieval):
344
336
  modalities=["text", "image"],
345
337
  sample_creation="created and machine-translated",
346
338
  bibtex_citation=r"""
347
- @misc{mace2025vidorev3,
348
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
349
- day = {5},
350
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
351
- journal = {Hugging Face Blog},
352
- month = {November},
353
- publisher = {Hugging Face},
354
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
355
- year = {2025},
339
+ @article{loison2026vidorev3comprehensiveevaluation,
340
+ archiveprefix = {arXiv},
341
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
342
+ eprint = {2601.08620},
343
+ primaryclass = {cs.AI},
344
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
345
+ url = {https://arxiv.org/abs/2601.08620},
346
+ year = {2026},
356
347
  }
357
348
  """,
358
349
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -364,7 +355,7 @@ class Vidore3TelecomRetrieval(AbsTaskRetrieval):
364
355
  metadata = TaskMetadata(
365
356
  name="Vidore3TelecomRetrieval",
366
357
  description="Retrieve associated pages according to questions.",
367
- reference="https://huggingface.co/blog/QuentinJG/introducing-vidore-v3",
358
+ reference="https://arxiv.org/abs/2601.08620",
368
359
  dataset={
369
360
  "path": "mteb-private/Vidore3TelecomRetrieval",
370
361
  "revision": "a54635a274ef2835721b7cbe3eb27483b9ec964b",
@@ -383,15 +374,14 @@ class Vidore3TelecomRetrieval(AbsTaskRetrieval):
383
374
  modalities=["text", "image"],
384
375
  sample_creation="created and machine-translated",
385
376
  bibtex_citation=r"""
386
- @misc{mace2025vidorev3,
387
- author = {Macé, Quentin and Loison, Antonio and EDY, Antoine and Xing, Victor and Viaud, Gautier},
388
- day = {5},
389
- howpublished = {\url{https://huggingface.co/blog/QuentinJG/introducing-vidore-v3}},
390
- journal = {Hugging Face Blog},
391
- month = {November},
392
- publisher = {Hugging Face},
393
- title = {ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-cases},
394
- year = {2025},
377
+ @article{loison2026vidorev3comprehensiveevaluation,
378
+ archiveprefix = {arXiv},
379
+ author = {António Loison and Quentin Macé and Antoine Edy and Victor Xing and Tom Balough and Gabriel Moreira and Bo Liu and Manuel Faysse and Céline Hudelot and Gautier Viaud},
380
+ eprint = {2601.08620},
381
+ primaryclass = {cs.AI},
382
+ title = {ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
383
+ url = {https://arxiv.org/abs/2601.08620},
384
+ year = {2026},
395
385
  }
396
386
  """,
397
387
  prompt={"query": "Find a screenshot that is relevant to the user's question."},
@@ -116,7 +116,7 @@ class WITT2IRetrieval(AbsTaskRetrieval):
116
116
  """,
117
117
  )
118
118
 
119
- def load_data(self) -> None:
119
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
120
120
  if self.data_loaded:
121
121
  return
122
122
 
@@ -104,7 +104,7 @@ class XFlickr30kCoT2IRetrieval(AbsTaskRetrieval):
104
104
  """,
105
105
  )
106
106
 
107
- def load_data(self) -> None:
107
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
108
108
  if self.data_loaded:
109
109
  return
110
110
 
@@ -64,7 +64,7 @@ class XQuADRetrieval(AbsTaskRetrieval):
64
64
  """,
65
65
  )
66
66
 
67
- def load_data(self) -> None:
67
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
68
68
  if self.data_loaded:
69
69
  return
70
70
 
@@ -146,7 +146,7 @@ class XM3600T2IRetrieval(AbsTaskRetrieval):
146
146
  """,
147
147
  )
148
148
 
149
- def load_data(self) -> None:
149
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
150
150
  if self.data_loaded:
151
151
  return
152
152
 
@@ -42,7 +42,7 @@ class CQADupstackAndroidNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackAndroid"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackEnglishNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackEnglish"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackGamingNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackGamingRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackGisNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackGisRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackMathematicaNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackMathematicaRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackPhysicsNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackPhysicsRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackProgrammersNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackProgrammersRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackStatsNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackStatsRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackTexNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackTexRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackUnixNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackUnixRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackWebmastersNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackWebmastersRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -42,7 +42,7 @@ class CQADupstackWordpressNLRetrieval(AbsTaskRetrieval):
42
42
  adapted_from=["CQADupstackWordpressRetrieval"],
43
43
  )
44
44
 
45
- def load_data(self) -> None:
45
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
46
46
  if self.data_loaded:
47
47
  return
48
48
 
@@ -50,11 +50,11 @@ Fishel, Mark},
50
50
  },
51
51
  )
52
52
 
53
- def load_data(self) -> None:
53
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
54
54
  """Load dataset from HuggingFace hub"""
55
55
  if self.data_loaded:
56
56
  return
57
- self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore
57
+ self.dataset = datasets.load_dataset(**self.metadata.dataset)
58
58
  self.dataset_transform()
59
59
  self.data_loaded = True
60
60
 
@@ -71,7 +71,7 @@ Fishel, Mark},
71
71
  text2id = {}
72
72
 
73
73
  for split in self.dataset:
74
- ds: datasets.Dataset = self.dataset[split] # type: ignore
74
+ ds: datasets.Dataset = self.dataset[split]
75
75
  ds = ds.shuffle(seed=42)
76
76
  max_samples = min(1024, len(ds))
77
77
  ds = ds.select(
@@ -37,11 +37,11 @@ class SNLRetrieval(AbsTaskRetrieval):
37
37
  task_subtypes=["Article retrieval"],
38
38
  )
39
39
 
40
- def load_data(self) -> None:
40
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
41
41
  """Load dataset from HuggingFace hub"""
42
42
  if self.data_loaded:
43
43
  return
44
- self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore
44
+ self.dataset = datasets.load_dataset(**self.metadata.dataset)
45
45
  self.dataset_transform()
46
46
  self.data_loaded = True
47
47
 
@@ -58,7 +58,7 @@ class SNLRetrieval(AbsTaskRetrieval):
58
58
  text2id = {}
59
59
 
60
60
  for split in self.dataset:
61
- ds: datasets.Dataset = self.dataset[split] # type: ignore
61
+ ds: datasets.Dataset = self.dataset[split]
62
62
  ds = ds.shuffle(seed=42)
63
63
 
64
64
  self.queries[split] = {}
@@ -36,7 +36,7 @@ class SlovakSumRetrieval(AbsTaskRetrieval):
36
36
  """,
37
37
  )
38
38
 
39
- def load_data(self) -> None:
39
+ def load_data(self, num_proc: int = 1, **kwargs) -> None:
40
40
  if self.data_loaded:
41
41
  return
42
42
  self.corpus, self.queries, self.relevant_docs = {}, {}, {}
@@ -59,7 +59,7 @@ class TurHistQuadRetrieval(AbsTaskRetrieval):
59
59
  text2id = {}
60
60
 
61
61
  for split in self.metadata.eval_splits:
62
- ds: datasets.Dataset = self.dataset[split] # type: ignore
62
+ ds: datasets.Dataset = self.dataset[split]
63
63
  ds = ds.shuffle(seed=42)
64
64
  max_samples = min(1024, len(ds))
65
65
  ds = ds.select(
@@ -1,5 +1,5 @@
1
1
  from .argu_ana_vn_retrieval import ArguAnaVN
2
- from .climate_fevervn_retrieval import ClimateFEVERVN
2
+ from .climate_fevervn_retrieval import ClimateFEVERVN, NanoClimateFEVERVN
3
3
  from .cqa_dupstack_android_vn_retrieval import CQADupstackAndroidVN
4
4
  from .cqa_dupstack_gis_vn_retrieval import CQADupstackGisVN
5
5
  from .cqa_dupstack_mathematica_vn_retrieval import CQADupstackMathematicaVN
@@ -10,19 +10,20 @@ from .cqa_dupstack_tex_vn_retrieval import CQADupstackTexVN
10
10
  from .cqa_dupstack_unix_vn_retrieval import CQADupstackUnixVN
11
11
  from .cqa_dupstack_webmasters_vn_retrieval import CQADupstackWebmastersVN
12
12
  from .cqa_dupstack_wordpress_vn_retrieval import CQADupstackWordpressVN
13
- from .db_pedia_vn_retrieval import DBPediaVN
14
- from .fevervn_retrieval import FEVERVN
13
+ from .db_pedia_vn_retrieval import DBPediaVN, NanoDBPediaVN
14
+ from .fevervn_retrieval import FEVERVN, NanoFEVERVN
15
15
  from .fi_qa2018_vn_retrieval import FiQA2018VN
16
16
  from .green_node_table_markdown_retrieval import GreenNodeTableMarkdownRetrieval
17
- from .hotpot_qavn_retrieval import HotpotQAVN
18
- from .msmarcovn_retrieval import MSMARCOVN
17
+ from .hotpot_qavn_retrieval import HotpotQAVN, NanoHotpotQAVN
18
+ from .msmarcovn_retrieval import MSMARCOVN, NanoMSMARCOVN
19
19
  from .nf_corpus_vn_retrieval import NFCorpusVN
20
- from .nqvn_retrieval import NQVN
20
+ from .nqvn_retrieval import NQVN, NanoNQVN
21
21
  from .quora_vn_retrieval import QuoraVN
22
22
  from .sci_fact_vn_retrieval import SciFactVN
23
23
  from .scidocsvn_retrieval import SCIDOCSVN
24
24
  from .touche2020_vn_retrieval import Touche2020VN
25
25
  from .treccovidvn_retrieval import TRECCOVIDVN
26
+ from .tvpl_retrieval import TVPLRetrieval
26
27
  from .vie_qu_ad_retrieval import VieQuADRetrieval
27
28
  from .zac_legal_text_retrieval import ZacLegalTextRetrieval
28
29
 
@@ -49,8 +50,15 @@ __all__ = [
49
50
  "GreenNodeTableMarkdownRetrieval",
50
51
  "HotpotQAVN",
51
52
  "NFCorpusVN",
53
+ "NanoClimateFEVERVN",
54
+ "NanoDBPediaVN",
55
+ "NanoFEVERVN",
56
+ "NanoHotpotQAVN",
57
+ "NanoMSMARCOVN",
58
+ "NanoNQVN",
52
59
  "QuoraVN",
53
60
  "SciFactVN",
61
+ "TVPLRetrieval",
54
62
  "Touche2020VN",
55
63
  "VieQuADRetrieval",
56
64
  "ZacLegalTextRetrieval",