mteb 2.0.5__py3-none-any.whl → 2.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (412)
  1. mteb/__init__.py +10 -1
  2. mteb/_create_dataloaders.py +8 -3
  3. mteb/_evaluators/any_sts_evaluator.py +14 -12
  4. mteb/_evaluators/clustering_evaluator.py +1 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +2 -2
  6. mteb/_evaluators/pair_classification_evaluator.py +3 -1
  7. mteb/_evaluators/retrieval_metrics.py +0 -9
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_stratification.py +1 -1
  13. mteb/abstasks/abstask.py +6 -1
  14. mteb/abstasks/clustering.py +1 -1
  15. mteb/abstasks/dataset_card_template.md +1 -1
  16. mteb/abstasks/multilabel_classification.py +2 -2
  17. mteb/abstasks/retrieval.py +2 -1
  18. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  19. mteb/abstasks/task_metadata.py +2 -1
  20. mteb/benchmarks/_create_table.py +1 -3
  21. mteb/benchmarks/benchmark.py +18 -1
  22. mteb/benchmarks/benchmarks/__init__.py +4 -0
  23. mteb/benchmarks/benchmarks/benchmarks.py +125 -16
  24. mteb/benchmarks/get_benchmark.py +3 -1
  25. mteb/cache.py +7 -3
  26. mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
  27. mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
  28. mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
  29. mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
  30. mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
  31. mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
  32. mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
  33. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
  34. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
  35. mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
  36. mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
  37. mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
  38. mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
  39. mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
  40. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  41. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  42. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  43. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  44. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  45. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  46. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  47. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  48. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  49. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  50. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  54. mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
  55. mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
  56. mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
  57. mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
  58. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  59. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  60. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  61. mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
  62. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  63. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  64. mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
  65. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  66. mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
  67. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  68. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  69. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  71. mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
  72. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  73. mteb/descriptive_stats/Retrieval/WinoGrande.json +14 -14
  74. mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
  75. mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
  76. mteb/evaluate.py +26 -6
  77. mteb/languages/check_language_code.py +11 -3
  78. mteb/languages/language_scripts.py +4 -0
  79. mteb/leaderboard/app.py +5 -3
  80. mteb/leaderboard/benchmark_selector.py +4 -2
  81. mteb/leaderboard/text_segments.py +1 -1
  82. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  83. mteb/models/instruct_wrapper.py +3 -0
  84. mteb/models/model_implementations/align_models.py +6 -0
  85. mteb/models/model_implementations/andersborges.py +51 -0
  86. mteb/models/model_implementations/ara_models.py +7 -0
  87. mteb/models/model_implementations/b1ade_models.py +1 -1
  88. mteb/models/model_implementations/bge_models.py +1 -3
  89. mteb/models/model_implementations/blip2_models.py +9 -0
  90. mteb/models/model_implementations/blip_models.py +19 -0
  91. mteb/models/model_implementations/bmretriever_models.py +1 -1
  92. mteb/models/model_implementations/cadet_models.py +8 -0
  93. mteb/models/model_implementations/cde_models.py +12 -0
  94. mteb/models/model_implementations/codefuse_models.py +15 -0
  95. mteb/models/model_implementations/codesage_models.py +12 -0
  96. mteb/models/model_implementations/cohere_models.py +1 -1
  97. mteb/models/model_implementations/colqwen_models.py +57 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +70 -0
  99. mteb/models/model_implementations/gme_v_models.py +2 -2
  100. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  101. mteb/models/model_implementations/inf_models.py +3 -3
  102. mteb/models/model_implementations/jasper_models.py +253 -2
  103. mteb/models/model_implementations/jina_models.py +12 -2
  104. mteb/models/model_implementations/kalm_models.py +159 -25
  105. mteb/models/model_implementations/llm2vec_models.py +1 -1
  106. mteb/models/model_implementations/misc_models.py +8 -2
  107. mteb/models/model_implementations/moco_models.py +9 -0
  108. mteb/models/model_implementations/mxbai_models.py +1 -1
  109. mteb/models/model_implementations/openclip_models.py +16 -0
  110. mteb/models/model_implementations/piccolo_models.py +6 -0
  111. mteb/models/model_implementations/rasgaard_models.py +33 -0
  112. mteb/models/model_implementations/reasonir_model.py +1 -1
  113. mteb/models/model_implementations/salesforce_models.py +1 -1
  114. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  115. mteb/models/model_implementations/spartan8806_atles_champion.py +26 -0
  116. mteb/models/model_implementations/tarka_models.py +374 -0
  117. mteb/models/model_implementations/voyage_models.py +6 -7
  118. mteb/models/model_implementations/voyage_v.py +10 -9
  119. mteb/models/model_implementations/yuan_models.py +33 -0
  120. mteb/models/search_wrappers.py +6 -5
  121. mteb/results/task_result.py +19 -17
  122. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  123. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  124. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +1 -5
  125. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  126. mteb/tasks/classification/ara/ajgt.py +1 -2
  127. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  128. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  129. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  130. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  131. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  132. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  133. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  134. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  135. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  136. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  137. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  138. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -3
  139. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  140. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  141. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  142. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  143. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  144. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  145. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  146. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  147. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  148. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  149. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  150. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  151. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  152. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  153. mteb/tasks/classification/eng/legal_bench_classification.py +15 -121
  154. mteb/tasks/classification/eng/news_classification.py +1 -2
  155. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  156. mteb/tasks/classification/eng/patent_classification.py +1 -2
  157. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  158. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  159. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  160. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  161. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  162. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  163. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  164. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  165. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  166. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  167. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  168. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  169. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  170. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  171. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  172. mteb/tasks/classification/est/estonian_valence.py +1 -2
  173. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  174. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  175. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  176. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  177. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  178. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  179. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +1 -2
  180. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  181. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  182. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  183. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  184. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  185. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  186. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  187. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  188. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  189. mteb/tasks/classification/kor/klue_tc.py +1 -2
  190. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  191. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  192. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  193. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  194. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  195. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  196. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  197. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  198. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  199. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  200. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  201. mteb/tasks/classification/mya/myanmar_news.py +2 -3
  202. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  203. mteb/tasks/classification/nld/__init__.py +16 -0
  204. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  205. mteb/tasks/classification/nld/dutch_cola_classification.py +41 -0
  206. mteb/tasks/classification/nld/dutch_government_bias_classification.py +40 -0
  207. mteb/tasks/classification/nld/dutch_news_articles_classification.py +33 -0
  208. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +39 -0
  209. mteb/tasks/classification/nld/iconclass_classification.py +44 -0
  210. mteb/tasks/classification/nld/open_tender_classification.py +41 -0
  211. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +49 -0
  212. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  213. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  214. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  215. mteb/tasks/classification/pol/polish_classification.py +3 -6
  216. mteb/tasks/classification/ron/moroco.py +1 -2
  217. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  218. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  220. mteb/tasks/classification/rus/headline_classification.py +1 -2
  221. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  222. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  223. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  224. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  225. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  226. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  227. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  228. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  229. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  230. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  231. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  232. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  233. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  234. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  235. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  236. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  237. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  238. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  239. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  240. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  241. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  242. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  243. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  244. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  245. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  246. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  247. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  248. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  249. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  250. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  251. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  252. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  253. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  254. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  255. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  256. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  257. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  258. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  259. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  260. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  261. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  262. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  263. mteb/tasks/clustering/__init__.py +1 -0
  264. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  265. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  266. mteb/tasks/clustering/nld/__init__.py +17 -0
  267. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +40 -0
  268. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +40 -0
  269. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +50 -0
  270. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +54 -0
  271. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +44 -0
  272. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +54 -0
  273. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +54 -0
  274. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  275. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  276. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  277. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  278. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  279. mteb/tasks/multilabel_classification/__init__.py +1 -0
  280. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  281. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  282. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  283. mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
  284. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +91 -0
  285. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +47 -0
  286. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  287. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  288. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  289. mteb/tasks/pair_classification/__init__.py +1 -0
  290. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  291. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  292. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  293. mteb/tasks/pair_classification/nld/__init__.py +7 -0
  294. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +39 -0
  295. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +44 -0
  296. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  297. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  298. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  299. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  300. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  301. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  302. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  303. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  304. mteb/tasks/retrieval/code/code_rag.py +8 -8
  305. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  306. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  307. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  308. mteb/tasks/retrieval/eng/__init__.py +18 -4
  309. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  310. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  311. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  312. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  313. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  314. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  315. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  316. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  317. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  318. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  319. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  320. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  321. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  322. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  323. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  324. mteb/tasks/retrieval/eng/wino_grande_retrieval.py +1 -1
  325. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  326. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  327. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  328. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  329. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +6 -5
  330. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  331. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  332. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  333. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  334. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  335. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  336. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  337. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  338. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  339. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  340. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  341. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  342. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  343. mteb/tasks/retrieval/nld/__init__.py +18 -4
  344. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  345. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +44 -0
  346. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +33 -0
  347. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +42 -0
  348. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  349. mteb/tasks/retrieval/nld/open_tender_retrieval.py +41 -0
  350. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  351. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  352. mteb/tasks/retrieval/nld/vabb_retrieval.py +44 -0
  353. mteb/tasks/retrieval/nob/norquad.py +2 -2
  354. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  355. mteb/tasks/retrieval/rus/__init__.py +11 -2
  356. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  357. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  358. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  359. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  360. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  361. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  362. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  363. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  364. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  365. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  366. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  367. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  368. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  369. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  370. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  371. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  372. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  373. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  374. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  375. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  376. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  377. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  378. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  379. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  380. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  381. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  382. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  383. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  384. mteb/tasks/sts/__init__.py +1 -0
  385. mteb/tasks/sts/nld/__init__.py +5 -0
  386. mteb/tasks/sts/nld/sick_nl_sts.py +42 -0
  387. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  388. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  389. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  390. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  391. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  392. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  393. mteb-2.1.19.dist-info/METADATA +253 -0
  394. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/RECORD +398 -330
  395. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  396. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  397. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  398. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  399. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  400. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  401. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  402. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  403. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  404. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  405. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  406. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  407. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  408. mteb-2.0.5.dist-info/METADATA +0 -455
  409. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/WHEEL +0 -0
  410. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/entry_points.txt +0 -0
  411. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/licenses/LICENSE +0 -0
  412. {mteb-2.0.5.dist-info → mteb-2.1.19.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from importlib.metadata import version
2
2
 
3
+ from mteb import types
3
4
  from mteb.abstasks import AbsTask
4
5
  from mteb.abstasks.task_metadata import TaskMetadata
5
6
  from mteb.deprecated_evaluator import MTEB
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
7
8
  from mteb.filter_tasks import filter_tasks
8
9
  from mteb.get_tasks import get_task, get_tasks
9
10
  from mteb.load_results import load_results
10
- from mteb.models import EncoderProtocol, SentenceTransformerEncoderWrapper
11
+ from mteb.models import (
12
+ CrossEncoderProtocol,
13
+ EncoderProtocol,
14
+ SearchProtocol,
15
+ SentenceTransformerEncoderWrapper,
16
+ )
11
17
  from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
12
18
  from mteb.results import BenchmarkResults, TaskResult
13
19
 
@@ -21,7 +27,9 @@ __all__ = [
21
27
  "AbsTask",
22
28
  "Benchmark",
23
29
  "BenchmarkResults",
30
+ "CrossEncoderProtocol",
24
31
  "EncoderProtocol",
32
+ "SearchProtocol",
25
33
  "SentenceTransformerEncoderWrapper",
26
34
  "TaskMetadata",
27
35
  "TaskResult",
@@ -35,4 +43,5 @@ __all__ = [
35
43
  "get_task",
36
44
  "get_tasks",
37
45
  "load_results",
46
+ "types",
38
47
  ]
mteb/_create_dataloaders.py CHANGED
@@ -3,7 +3,7 @@ from collections.abc import Callable
3
3
  from typing import Any, cast
4
4
 
5
5
  import torch
6
- from datasets import Dataset
6
+ from datasets import Dataset, Image
7
7
  from torch.utils.data import DataLoader, default_collate
8
8
 
9
9
  from mteb.abstasks.task_metadata import TaskMetadata
@@ -22,12 +22,14 @@ logger = logging.getLogger(__name__)
22
22
  def _create_dataloader_from_texts(
23
23
  text: list[str],
24
24
  batch_size: int = 32,
25
+ **kwargs: dict[str, Any],
25
26
  ) -> DataLoader[TextInput]:
26
27
  """Create a dataloader from a list of text.
27
28
 
28
29
  Args:
29
30
  text: A list of text to create a dataloader from.
30
31
  batch_size: Batch size for the dataloader.
32
+ kwargs: Not used, present catching extra arguments.
31
33
 
32
34
  Returns:
33
35
  A dataloader with the text.
@@ -244,14 +246,15 @@ def _prepare_image_dataset(
244
246
  transform: Callable[[Any], Any] | None = None,
245
247
  ) -> Dataset:
246
248
  """Prepare the image dataset by converting images to RGB and applying transformations."""
247
- # If the dataset uses a different column name for images, rename it to "image".
248
249
  if (
249
250
  image_column_name
250
251
  and image_column_name in dataset.column_names
251
252
  and "image" not in dataset.column_names
252
253
  ):
253
254
  dataset = dataset.rename_column(image_column_name, "image")
254
- # Map the conversion function over the dataset.
255
+ # don't process image if it's already in the correct format
256
+ if isinstance(dataset.features["image"], Image):
257
+ return dataset
255
258
  return dataset.map(
256
259
  _convert_images_to_rgb,
257
260
  fn_kwargs={"image_col_name": "image", "transform": transform},
@@ -277,6 +280,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
277
280
  # Leave the images as a list to avoid stacking errors.
278
281
  collated[key] = [item[key] for item in batch]
279
282
  else:
283
+ if any(item[key] is None for item in batch):
284
+ raise ValueError(f"Found None in batch for key '{key}'")
280
285
  collated[key] = default_collate([item[key] for item in batch])
281
286
  return collated
282
287
 
@@ -45,16 +45,8 @@ class AnySTSEvaluator(Evaluator):
45
45
  **kwargs,
46
46
  ) -> None:
47
47
  super().__init__(**kwargs)
48
- self.first_column = create_dataloader(
49
- dataset,
50
- task_metadata,
51
- input_column=sentences_column_names[0],
52
- )
53
- self.second_column = create_dataloader(
54
- dataset,
55
- task_metadata,
56
- input_column=sentences_column_names[1],
57
- )
48
+ self.dataset = dataset
49
+ self.input_columns = sentences_column_names
58
50
  self.task_metadata = task_metadata
59
51
  self.hf_split = hf_split
60
52
  self.hf_subset = hf_subset
@@ -67,7 +59,12 @@ class AnySTSEvaluator(Evaluator):
67
59
  ) -> STSEvaluatorScores:
68
60
  logger.info("Running semantic similarity - Encoding samples (1/2)")
69
61
  embeddings1 = model.encode(
70
- self.first_column,
62
+ create_dataloader(
63
+ self.dataset,
64
+ self.task_metadata,
65
+ input_column=self.input_columns[0],
66
+ **encode_kwargs,
67
+ ),
71
68
  task_metadata=self.task_metadata,
72
69
  hf_split=self.hf_split,
73
70
  hf_subset=self.hf_subset,
@@ -76,7 +73,12 @@ class AnySTSEvaluator(Evaluator):
76
73
 
77
74
  logger.info("Running semantic similarity - Encoding samples (2/2)...")
78
75
  embeddings2 = model.encode(
79
- self.second_column,
76
+ create_dataloader(
77
+ self.dataset,
78
+ self.task_metadata,
79
+ input_column=self.input_columns[1],
80
+ **encode_kwargs,
81
+ ),
80
82
  task_metadata=self.task_metadata,
81
83
  hf_split=self.hf_split,
82
84
  hf_subset=self.hf_subset,
@@ -44,7 +44,7 @@ class ClusteringEvaluator(Evaluator):
44
44
  self.dataset,
45
45
  self.task_metadata,
46
46
  input_column=self.input_column_name,
47
- batch_size=encode_kwargs["batch_size"],
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running clustering - Encoding samples...")
@@ -103,7 +103,7 @@ class ImageTextPairClassificationEvaluator(Evaluator):
103
103
  text_embeddings = model.encode(
104
104
  DataLoader(
105
105
  Dataset.from_dict({"text": texts}),
106
- batch_size=encode_kwargs["batch_size"],
106
+ **encode_kwargs,
107
107
  ),
108
108
  task_metadata=self.task_metadata,
109
109
  hf_subset=self.hf_subset,
@@ -122,8 +122,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
122
122
  image_embeddings = model.encode(
123
123
  DataLoader(
124
124
  CustomImageDataset(images),
125
- batch_size=encode_kwargs["batch_size"],
126
125
  collate_fn=lambda x: {"image": [item["image"] for item in x]},
126
+ **encode_kwargs,
127
127
  ),
128
128
  task_metadata=self.task_metadata,
129
129
  hf_subset=self.hf_subset,
@@ -106,6 +106,7 @@ class PairClassificationEvaluator(Evaluator):
106
106
  self.dataset,
107
107
  task_metadata=self.task_metadata,
108
108
  input_column=self.input1_column_name,
109
+ **encode_kwargs,
109
110
  ),
110
111
  task_metadata=self.task_metadata,
111
112
  hf_split=self.hf_split,
@@ -117,6 +118,7 @@ class PairClassificationEvaluator(Evaluator):
117
118
  self.dataset,
118
119
  task_metadata=self.task_metadata,
119
120
  input_column=self.input2_column_name,
121
+ **encode_kwargs,
120
122
  ),
121
123
  task_metadata=self.task_metadata,
122
124
  hf_split=self.hf_split,
@@ -168,7 +170,7 @@ class PairClassificationEvaluator(Evaluator):
168
170
  )
169
171
  all_unique_texts_embs = np.asarray(
170
172
  model.encode(
171
- _create_dataloader_from_texts(all_unique_texts),
173
+ _create_dataloader_from_texts(all_unique_texts, **encode_kwargs),
172
174
  task_metadata=task_metadata,
173
175
  hf_split=hf_split,
174
176
  hf_subset=hf_subset,
@@ -5,7 +5,6 @@ from typing import Any
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  import pytrec_eval
8
- import torch
9
8
  from packaging.version import Version
10
9
  from sklearn.metrics import auc
11
10
 
@@ -14,14 +13,6 @@ from mteb.types import RelevantDocumentsType, RetrievalEvaluationResult
14
13
  logger = logging.getLogger(__name__)
15
14
 
16
15
 
17
- try:
18
- # speeds up computation if available
19
- torch.set_float32_matmul_precision("high")
20
- logger.info("Setting torch float32 matmul precision to high for a speedup")
21
- except Exception:
22
- pass
23
-
24
-
25
16
  def mrr(
26
17
  qrels: RelevantDocumentsType,
27
18
  results: dict[str, dict[str, float]],
@@ -6,7 +6,7 @@ from datasets import Dataset
6
6
  from torch.utils.data import DataLoader
7
7
  from typing_extensions import Self
8
8
 
9
- from mteb._create_dataloaders import _create_image_dataloader
9
+ from mteb._create_dataloaders import create_dataloader
10
10
  from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models import EncoderProtocol
12
12
  from mteb.types import BatchedInput
@@ -50,33 +50,20 @@ class SklearnEvaluator(Evaluator):
50
50
  self.evaluator_model = evaluator_model
51
51
 
52
52
  def create_dataloaders(
53
- self, batch_size: int
53
+ self, encode_kwargs: dict[str, Any]
54
54
  ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
55
- if self.task_metadata.modalities == ["image"]:
56
- dataloader_train = _create_image_dataloader(
57
- self.train_dataset,
58
- image_column_name=self.values_column_name,
59
- batch_size=batch_size,
60
- )
61
- dataloader_test = _create_image_dataloader(
62
- self.eval_dataset,
63
- image_column_name=self.values_column_name,
64
- batch_size=batch_size,
65
- )
66
- elif self.task_metadata.modalities == ["text"]:
67
- if self.values_column_name != "text":
68
- self.train_dataset = self.train_dataset.rename_column(
69
- self.values_column_name, "text"
70
- )
71
- self.eval_dataset = self.eval_dataset.rename_column(
72
- self.values_column_name, "text"
73
- )
74
- dataloader_train = DataLoader(self.train_dataset)
75
- dataloader_test = DataLoader(self.eval_dataset)
76
- else:
77
- raise ValueError(
78
- "ClassificationEvaluator only supports image and text modalities."
79
- )
55
+ dataloader_train = create_dataloader(
56
+ self.train_dataset,
57
+ self.task_metadata,
58
+ input_column=self.values_column_name,
59
+ **encode_kwargs,
60
+ )
61
+ dataloader_test = create_dataloader(
62
+ self.eval_dataset,
63
+ self.task_metadata,
64
+ input_column=self.values_column_name,
65
+ **encode_kwargs,
66
+ )
80
67
  return dataloader_train, dataloader_test
81
68
 
82
69
  def __call__( # type: ignore[override]
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
98
85
 
99
86
  """
100
87
  dataloader_train, dataloader_test = self.create_dataloaders(
101
- batch_size=encode_kwargs["batch_size"]
88
+ encode_kwargs=encode_kwargs,
102
89
  )
103
90
 
104
91
  logger.info("Running - Encoding samples...")
@@ -46,7 +46,10 @@ class BitextMiningEvaluator(Evaluator):
46
46
 
47
47
  embeddings = {}
48
48
  for sub in tqdm(subsets):
49
- dataloader = _create_dataloader_from_texts(self.sentences[sub])
49
+ dataloader = _create_dataloader_from_texts(
50
+ self.sentences[sub],
51
+ **encode_kwargs,
52
+ )
50
53
  embeddings[sub] = model.encode(
51
54
  dataloader,
52
55
  task_metadata=self.task_metadata,
@@ -109,7 +109,8 @@ class SummarizationEvaluator(Evaluator):
109
109
  summary
110
110
  for human_summaries in self.human_summaries
111
111
  for summary in human_summaries
112
- ]
112
+ ],
113
+ **encode_kwargs,
113
114
  ),
114
115
  task_metadata=self.task_metadata,
115
116
  hf_subset=self.hf_subset,
@@ -124,7 +125,8 @@ class SummarizationEvaluator(Evaluator):
124
125
  summary
125
126
  for machine_summaries in self.machine_summaries
126
127
  for summary in machine_summaries
127
- ]
128
+ ],
129
+ **encode_kwargs,
128
130
  ),
129
131
  task_metadata=self.task_metadata,
130
132
  hf_subset=self.hf_subset,
@@ -42,14 +42,14 @@ class ZeroShotClassificationEvaluator(Evaluator):
42
42
  ) -> Array:
43
43
  dataloader = create_dataloader(
44
44
  self.dataset,
45
- batch_size=encode_kwargs["batch_size"],
46
45
  input_column=self.input_column_name,
47
46
  task_metadata=self.task_metadata,
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running zero-shot classification - Encoding labels...")
51
51
  text_label_embeddings = model.encode(
52
- _create_dataloader_from_texts(self.candidate_labels),
52
+ _create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
53
53
  task_metadata=self.task_metadata,
54
54
  hf_subset=self.hf_subset,
55
55
  hf_split=self.hf_split,
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
134
134
  class IterativeStratification(_BaseKFold):
135
135
  """Iteratively stratify a multi-label data set into folds
136
136
 
137
- Construct an interative stratifier that splits the data set into folds trying to maintain balanced representation
137
+ Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
138
138
  with respect to order-th label combinations.
139
139
  """
140
140
 
mteb/abstasks/abstask.py CHANGED
@@ -459,7 +459,7 @@ class AbsTask(ABC):
459
459
  """Filter the languages of the task.
460
460
 
461
461
  Args:
462
- languages: list of languages to filter the task by can be either a 3-letter langauge code (e.g. "eng") or also include the script
462
+ languages: list of languages to filter the task by. Each can be either a 3-letter language code (e.g. "eng") or also include the script
463
463
  (e.g. "eng-Latn")
464
464
  script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
465
465
  If the language code does not specify the script the intersection of the language and script will be used.
@@ -491,6 +491,11 @@ class AbsTask(ABC):
491
491
  if lang_scripts.contains_languages(langs):
492
492
  subsets_to_keep.append(hf_subset)
493
493
 
494
+ if len(subsets_to_keep) == 0:
495
+ raise ValueError(
496
+ f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
497
+ )
498
+
494
499
  self.hf_subsets = subsets_to_keep
495
500
  return self
496
501
 
@@ -200,7 +200,7 @@ class AbsTaskClustering(AbsTask):
200
200
  downsampled_dataset,
201
201
  self.metadata,
202
202
  input_column=self.input_column_name,
203
- batch_size=encode_kwargs["batch_size"],
203
+ **encode_kwargs,
204
204
  ),
205
205
  task_metadata=self.metadata,
206
206
  hf_subset=hf_subset,
@@ -85,7 +85,7 @@ desc_stats = task.metadata.descriptive_stats
85
85
  ```
86
86
 
87
87
  ```json
88
- {{ descritptive_stats | default("{}", true) }}
88
+ {{ descriptive_stats | default("{}", true) }}
89
89
  ```
90
90
 
91
91
  </details>
@@ -112,7 +112,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
112
112
  unique_train_dataset,
113
113
  self.metadata,
114
114
  input_column=self.input_column_name,
115
- batch_size=encode_kwargs["batch_size"],
115
+ **encode_kwargs,
116
116
  )
117
117
 
118
118
  logger.info("Running multilabel classification - Encoding training set...")
@@ -141,7 +141,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
141
141
  test_dataset.select_columns(self.input_column_name),
142
142
  self.metadata,
143
143
  input_column=self.input_column_name,
144
- batch_size=encode_kwargs["batch_size"],
144
+ **encode_kwargs,
145
145
  )
146
146
 
147
147
  logger.info("Running multilabel classification - Encoding test set...")
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
653
653
  FileNotFoundError: If the specified path does not exist.
654
654
  ValueError: If the loaded top ranked results are not in the expected format.
655
655
  """
656
+ self._top_k = top_k
657
+
656
658
  top_ranked_path = Path(top_ranked_path)
657
659
  if top_ranked_path.is_dir():
658
660
  top_ranked_path = self._predictions_path(top_ranked_path)
@@ -682,7 +684,6 @@ class AbsTaskRetrieval(AbsTask):
682
684
  top_k_sorted[query_id] = sorted_keys[: self._top_k]
683
685
 
684
686
  self.dataset[subset][split]["top_ranked"] = top_k_sorted
685
- self._top_k = top_k
686
687
  return self
687
688
 
688
689
 
@@ -176,7 +176,7 @@ class RetrievalDatasetLoader:
176
176
  {
177
177
  "query-id": Value("string"),
178
178
  "corpus-id": Value("string"),
179
- "score": Value("uint16"),
179
+ "score": Value("int32"),
180
180
  }
181
181
  )
182
182
  )
@@ -107,6 +107,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
107
107
  SampleCreationMethod = Literal[
108
108
  "found",
109
109
  "created",
110
+ "created and machine-translated",
110
111
  "human-translated and localized",
111
112
  "human-translated",
112
113
  "machine-translated",
@@ -532,7 +533,7 @@ class TaskMetadata(BaseModel):
532
533
  citation=self.bibtex_citation,
533
534
  dataset_description=self.description,
534
535
  dataset_reference=self.reference,
535
- descritptive_stats=descriptive_stats,
536
+ descriptive_stats=descriptive_stats,
536
537
  dataset_task_name=self.name,
537
538
  category=self.category,
538
539
  domains=", ".join(self.domains) if self.domains else None,
@@ -358,9 +358,7 @@ def _create_summary_table_mean_public_private(
358
358
  "mean(public)": "Mean (Public)",
359
359
  "mean(private)": "Mean (Private)",
360
360
  }
361
- # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
362
- if "Retrieval" in joint_table.columns:
363
- rename_dict["Retrieval"] = "Mean (Task)"
361
+
364
362
  joint_table = joint_table.rename(columns=rename_dict)
365
363
 
366
364
  # Move borda rank to front
@@ -87,7 +87,10 @@ class RtebBenchmark(Benchmark):
87
87
  def _create_summary_table(
88
88
  self, benchmark_results: BenchmarkResults
89
89
  ) -> pd.DataFrame:
90
- return _create_summary_table_mean_public_private(benchmark_results)
90
+ joint_table = _create_summary_table_mean_public_private(benchmark_results)
91
+ # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
92
+ joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
93
+ return joint_table
91
94
 
92
95
 
93
96
  class HUMEBenchmark(Benchmark):
@@ -106,3 +109,17 @@ class MIEBBenchmark(Benchmark):
106
109
  self, benchmark_results: BenchmarkResults
107
110
  ) -> pd.DataFrame:
108
111
  return _create_summary_table_mean_task_type(benchmark_results)
112
+
113
+
114
+ class VidoreBenchmark(Benchmark):
115
+ """Wrapper for Vidore3 benchmark."""
116
+
117
+ def _create_summary_table(
118
+ self, benchmark_results: BenchmarkResults
119
+ ) -> pd.DataFrame:
120
+ joint_table = _create_summary_table_mean_public_private(benchmark_results)
121
+ # For ViDoRe (V1, V2, V3): all tasks are Document Understanding type, so Document Understanding column = Mean (Task)
122
+ joint_table = joint_table.rename(
123
+ columns={"Document Understanding": "Mean (Task)"}
124
+ )
125
+ return joint_table
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
27
27
  MTEB_KOR,
28
28
  MTEB_MAIN_RU,
29
29
  MTEB_MINERS_BITEXT_MINING,
30
+ MTEB_NL,
30
31
  MTEB_POL,
31
32
  MTEB_RETRIEVAL_LAW,
32
33
  MTEB_RETRIEVAL_MEDICAL,
@@ -37,6 +38,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
37
38
  SEB,
38
39
  VIDORE,
39
40
  VIDORE_V2,
41
+ VIDORE_V3,
40
42
  VISUAL_DOCUMENT_RETRIEVAL,
41
43
  VN_MTEB,
42
44
  CoIR,
@@ -87,6 +89,7 @@ __all__ = [
87
89
  "MTEB_KOR",
88
90
  "MTEB_MAIN_RU",
89
91
  "MTEB_MINERS_BITEXT_MINING",
92
+ "MTEB_NL",
90
93
  "MTEB_POL",
91
94
  "MTEB_RETRIEVAL_LAW",
92
95
  "MTEB_RETRIEVAL_MEDICAL",
@@ -106,6 +109,7 @@ __all__ = [
106
109
  "SEB",
107
110
  "VIDORE",
108
111
  "VIDORE_V2",
112
+ "VIDORE_V3",
109
113
  "VISUAL_DOCUMENT_RETRIEVAL",
110
114
  "VN_MTEB",
111
115
  "CoIR",