mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -9,8 +9,10 @@ from mteb.filter_tasks import filter_tasks
9
9
  from mteb.get_tasks import get_task, get_tasks
10
10
  from mteb.load_results import load_results
11
11
  from mteb.models import (
12
+ CacheBackendProtocol,
12
13
  CrossEncoderProtocol,
13
14
  EncoderProtocol,
15
+ IndexEncoderSearchProtocol,
14
16
  SearchProtocol,
15
17
  SentenceTransformerEncoderWrapper,
16
18
  )
@@ -27,8 +29,10 @@ __all__ = [
27
29
  "AbsTask",
28
30
  "Benchmark",
29
31
  "BenchmarkResults",
32
+ "CacheBackendProtocol",
30
33
  "CrossEncoderProtocol",
31
34
  "EncoderProtocol",
35
+ "IndexEncoderSearchProtocol",
32
36
  "SearchProtocol",
33
37
  "SentenceTransformerEncoderWrapper",
34
38
  "TaskMetadata",
@@ -3,7 +3,7 @@ from collections.abc import Callable
3
3
  from typing import Any, cast
4
4
 
5
5
  import torch
6
- from datasets import Dataset
6
+ from datasets import Dataset, Image
7
7
  from torch.utils.data import DataLoader, default_collate
8
8
 
9
9
  from mteb.abstasks.task_metadata import TaskMetadata
@@ -22,12 +22,14 @@ logger = logging.getLogger(__name__)
22
22
  def _create_dataloader_from_texts(
23
23
  text: list[str],
24
24
  batch_size: int = 32,
25
+ **kwargs: dict[str, Any],
25
26
  ) -> DataLoader[TextInput]:
26
27
  """Create a dataloader from a list of text.
27
28
 
28
29
  Args:
29
30
  text: A list of text to create a dataloader from.
30
31
  batch_size: Batch size for the dataloader.
32
+ kwargs: Not used, present catching extra arguments.
31
33
 
32
34
  Returns:
33
35
  A dataloader with the text.
@@ -244,14 +246,15 @@ def _prepare_image_dataset(
244
246
  transform: Callable[[Any], Any] | None = None,
245
247
  ) -> Dataset:
246
248
  """Prepare the image dataset by converting images to RGB and applying transformations."""
247
- # If the dataset uses a different column name for images, rename it to "image".
248
249
  if (
249
250
  image_column_name
250
251
  and image_column_name in dataset.column_names
251
252
  and "image" not in dataset.column_names
252
253
  ):
253
254
  dataset = dataset.rename_column(image_column_name, "image")
254
- # Map the conversion function over the dataset.
255
+ # don't process image if it's already in the correct format
256
+ if isinstance(dataset.features["image"], Image):
257
+ return dataset
255
258
  return dataset.map(
256
259
  _convert_images_to_rgb,
257
260
  fn_kwargs={"image_col_name": "image", "transform": transform},
@@ -12,6 +12,7 @@ from mteb._create_dataloaders import create_dataloader
12
12
  from mteb.abstasks.task_metadata import TaskMetadata
13
13
  from mteb.models import EncoderProtocol
14
14
  from mteb.similarity_functions import compute_pairwise_similarity
15
+ from mteb.types import PromptType
15
16
 
16
17
  from .evaluator import Evaluator
17
18
 
@@ -42,22 +43,18 @@ class AnySTSEvaluator(Evaluator):
42
43
  task_metadata: TaskMetadata,
43
44
  hf_split: str,
44
45
  hf_subset: str,
46
+ input1_prompt_type: PromptType | None,
47
+ input2_prompt_type: PromptType | None,
45
48
  **kwargs,
46
49
  ) -> None:
47
50
  super().__init__(**kwargs)
48
- self.first_column = create_dataloader(
49
- dataset,
50
- task_metadata,
51
- input_column=sentences_column_names[0],
52
- )
53
- self.second_column = create_dataloader(
54
- dataset,
55
- task_metadata,
56
- input_column=sentences_column_names[1],
57
- )
51
+ self.dataset = dataset
52
+ self.input_columns = sentences_column_names
58
53
  self.task_metadata = task_metadata
59
54
  self.hf_split = hf_split
60
55
  self.hf_subset = hf_subset
56
+ self.input1_prompt_type = input1_prompt_type
57
+ self.input2_prompt_type = input2_prompt_type
61
58
 
62
59
  def __call__(
63
60
  self,
@@ -67,19 +64,31 @@ class AnySTSEvaluator(Evaluator):
67
64
  ) -> STSEvaluatorScores:
68
65
  logger.info("Running semantic similarity - Encoding samples (1/2)")
69
66
  embeddings1 = model.encode(
70
- self.first_column,
67
+ create_dataloader(
68
+ self.dataset,
69
+ self.task_metadata,
70
+ input_column=self.input_columns[0],
71
+ **encode_kwargs,
72
+ ),
71
73
  task_metadata=self.task_metadata,
72
74
  hf_split=self.hf_split,
73
75
  hf_subset=self.hf_subset,
76
+ prompt_type=self.input1_prompt_type,
74
77
  **encode_kwargs,
75
78
  )
76
79
 
77
80
  logger.info("Running semantic similarity - Encoding samples (2/2)...")
78
81
  embeddings2 = model.encode(
79
- self.second_column,
82
+ create_dataloader(
83
+ self.dataset,
84
+ self.task_metadata,
85
+ input_column=self.input_columns[1],
86
+ **encode_kwargs,
87
+ ),
80
88
  task_metadata=self.task_metadata,
81
89
  hf_split=self.hf_split,
82
90
  hf_subset=self.hf_subset,
91
+ prompt_type=self.input2_prompt_type,
83
92
  **encode_kwargs,
84
93
  )
85
94
 
@@ -0,0 +1,54 @@
1
+ import numpy as np
2
+
3
+
4
+ def hamming_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
5
+ """Compute the Hamming score (a.k.a. label-based accuracy) for multilabel classification.
6
+
7
+ The Hamming score is the fraction of labels that are correctly predicted for each sample,
8
+ averaged over all samples. For samples where both y_true and y_pred have no labels,
9
+ the score is 1.0 (perfect agreement).
10
+
11
+ Args:
12
+ y_true: Binary matrix of true labels with shape (n_samples, n_labels)
13
+ y_pred: Binary matrix of predicted labels with shape (n_samples, n_labels)
14
+
15
+ Returns:
16
+ float: Hamming score between 0.0 and 1.0
17
+
18
+ Raises:
19
+ ValueError: If inputs are invalid or have incompatible shapes
20
+ TypeError: If inputs cannot be converted to numpy arrays
21
+ """
22
+ y_true = np.asarray(y_true)
23
+ y_pred = np.asarray(y_pred)
24
+
25
+ # Check shapes
26
+ if y_true.shape != y_pred.shape:
27
+ raise ValueError(
28
+ f"Shape mismatch: y_true {y_true.shape} != y_pred {y_pred.shape}"
29
+ )
30
+
31
+ # Check if arrays are empty
32
+ if y_true.size == 0:
33
+ raise ValueError("Input arrays cannot be empty")
34
+
35
+ # Ensure 2D arrays
36
+ if y_true.ndim != 2:
37
+ raise ValueError(f"Arrays must be 2D, got {y_true.ndim}D")
38
+
39
+ # Check for binary values
40
+ if not (np.all(np.isin(y_true, [0, 1])) and np.all(np.isin(y_pred, [0, 1]))):
41
+ raise ValueError("Arrays must contain only binary values (0 and 1)")
42
+
43
+ # Convert to boolean for bitwise operations
44
+ y_true_bool = y_true.astype(bool)
45
+ y_pred_bool = y_pred.astype(bool)
46
+
47
+ # Calculate intersection and union for each sample
48
+ intersection = (y_true_bool & y_pred_bool).sum(axis=1)
49
+ union = (y_true_bool | y_pred_bool).sum(axis=1)
50
+
51
+ # Handle division by zero: when union is 0, both are all zeros, so score is 1.0
52
+ scores = np.where(union == 0, 1.0, intersection / union)
53
+
54
+ return float(scores.mean())
@@ -44,7 +44,7 @@ class ClusteringEvaluator(Evaluator):
44
44
  self.dataset,
45
45
  self.task_metadata,
46
46
  input_column=self.input_column_name,
47
- batch_size=encode_kwargs["batch_size"],
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running clustering - Encoding samples...")
@@ -1,10 +1,11 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
7
  import torch.nn.functional as F
6
8
  from datasets import Dataset
7
- from PIL.Image import Image
8
9
  from torch.utils.data import DataLoader
9
10
 
10
11
  from mteb._create_dataloaders import (
@@ -15,6 +16,10 @@ from mteb._requires_package import requires_image_dependencies
15
16
  from mteb.abstasks.task_metadata import TaskMetadata
16
17
  from mteb.models.models_protocols import EncoderProtocol
17
18
 
19
+ if TYPE_CHECKING:
20
+ from PIL.Image import Image
21
+
22
+
18
23
  logger = logging.getLogger(__name__)
19
24
 
20
25
 
@@ -103,7 +108,7 @@ class ImageTextPairClassificationEvaluator(Evaluator):
103
108
  text_embeddings = model.encode(
104
109
  DataLoader(
105
110
  Dataset.from_dict({"text": texts}),
106
- batch_size=encode_kwargs["batch_size"],
111
+ **encode_kwargs,
107
112
  ),
108
113
  task_metadata=self.task_metadata,
109
114
  hf_subset=self.hf_subset,
@@ -122,8 +127,8 @@ class ImageTextPairClassificationEvaluator(Evaluator):
122
127
  image_embeddings = model.encode(
123
128
  DataLoader(
124
129
  CustomImageDataset(images),
125
- batch_size=encode_kwargs["batch_size"],
126
130
  collate_fn=lambda x: {"image": [item["image"] for item in x]},
131
+ **encode_kwargs,
127
132
  ),
128
133
  task_metadata=self.task_metadata,
129
134
  hf_subset=self.hf_subset,
@@ -14,6 +14,7 @@ from mteb._evaluators.evaluator import Evaluator
14
14
  from mteb.abstasks.task_metadata import TaskMetadata
15
15
  from mteb.models import EncoderProtocol
16
16
  from mteb.similarity_functions import compute_pairwise_similarity
17
+ from mteb.types import PromptType
17
18
 
18
19
  logger = logging.getLogger(__name__)
19
20
 
@@ -60,6 +61,8 @@ class PairClassificationEvaluator(Evaluator):
60
61
  task_metadata: TaskMetadata,
61
62
  hf_split: str,
62
63
  hf_subset: str,
64
+ input1_prompt_type: PromptType | None,
65
+ input2_prompt_type: PromptType | None,
63
66
  **kwargs,
64
67
  ) -> None:
65
68
  super().__init__(**kwargs)
@@ -69,6 +72,8 @@ class PairClassificationEvaluator(Evaluator):
69
72
  self.task_metadata = task_metadata
70
73
  self.hf_split = hf_split
71
74
  self.hf_subset = hf_subset
75
+ self.input1_prompt_type = input1_prompt_type
76
+ self.input2_prompt_type = input2_prompt_type
72
77
 
73
78
  if len(self.dataset[self.input1_column_name]) != len(
74
79
  self.dataset[self.input2_column_name]
@@ -82,47 +87,34 @@ class PairClassificationEvaluator(Evaluator):
82
87
  model: EncoderProtocol,
83
88
  encode_kwargs: dict[str, Any],
84
89
  ) -> PairClassificationDistances:
85
- logger.info("Running pair classification - Encoding inputs...")
86
- if self.task_metadata.modalities == ["text"]:
87
- # datasets v4 will pass column objects, so we need to extract the text
88
- all_sentences = (
89
- self.dataset[self.input1_column_name][:]
90
- + self.dataset[self.input2_column_name][:]
91
- )
92
- len_sentences1 = len(self.dataset[self.input1_column_name])
93
- embeddings = self._encode_unique_texts(
94
- all_sentences,
95
- model,
96
- task_metadata=self.task_metadata,
97
- hf_split=self.hf_split,
98
- hf_subset=self.hf_subset,
99
- **encode_kwargs,
100
- )
101
- embeddings1 = embeddings[:len_sentences1]
102
- embeddings2 = embeddings[len_sentences1:]
103
- else:
104
- embeddings1 = model.encode(
105
- create_dataloader(
106
- self.dataset,
107
- task_metadata=self.task_metadata,
108
- input_column=self.input1_column_name,
109
- ),
90
+ logger.info("Running pair classification - Encoding samples (1/2)")
91
+ embeddings1 = model.encode(
92
+ create_dataloader(
93
+ self.dataset,
110
94
  task_metadata=self.task_metadata,
111
- hf_split=self.hf_split,
112
- hf_subset=self.hf_subset,
95
+ input_column=self.input1_column_name,
113
96
  **encode_kwargs,
114
- )
115
- embeddings2 = model.encode(
116
- create_dataloader(
117
- self.dataset,
118
- task_metadata=self.task_metadata,
119
- input_column=self.input2_column_name,
120
- ),
97
+ ),
98
+ task_metadata=self.task_metadata,
99
+ hf_split=self.hf_split,
100
+ hf_subset=self.hf_subset,
101
+ prompt_type=self.input1_prompt_type,
102
+ **encode_kwargs,
103
+ )
104
+ logger.info("Running pair classification - Encoding samples (2/2)")
105
+ embeddings2 = model.encode(
106
+ create_dataloader(
107
+ self.dataset,
121
108
  task_metadata=self.task_metadata,
122
- hf_split=self.hf_split,
123
- hf_subset=self.hf_subset,
109
+ input_column=self.input2_column_name,
124
110
  **encode_kwargs,
125
- )
111
+ ),
112
+ task_metadata=self.task_metadata,
113
+ hf_split=self.hf_split,
114
+ hf_subset=self.hf_subset,
115
+ prompt_type=self.input2_prompt_type,
116
+ **encode_kwargs,
117
+ )
126
118
 
127
119
  logger.info("Running pair classification - Evaluating pair similarity...")
128
120
  cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
@@ -168,7 +160,7 @@ class PairClassificationEvaluator(Evaluator):
168
160
  )
169
161
  all_unique_texts_embs = np.asarray(
170
162
  model.encode(
171
- _create_dataloader_from_texts(all_unique_texts),
163
+ _create_dataloader_from_texts(all_unique_texts, **encode_kwargs),
172
164
  task_metadata=task_metadata,
173
165
  hf_split=hf_split,
174
166
  hf_subset=hf_subset,
@@ -6,7 +6,7 @@ from datasets import Dataset
6
6
  from torch.utils.data import DataLoader
7
7
  from typing_extensions import Self
8
8
 
9
- from mteb._create_dataloaders import _create_image_dataloader
9
+ from mteb._create_dataloaders import create_dataloader
10
10
  from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models import EncoderProtocol
12
12
  from mteb.types import BatchedInput
@@ -50,33 +50,20 @@ class SklearnEvaluator(Evaluator):
50
50
  self.evaluator_model = evaluator_model
51
51
 
52
52
  def create_dataloaders(
53
- self, batch_size: int
53
+ self, encode_kwargs: dict[str, Any]
54
54
  ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
55
- if self.task_metadata.modalities == ["image"]:
56
- dataloader_train = _create_image_dataloader(
57
- self.train_dataset,
58
- image_column_name=self.values_column_name,
59
- batch_size=batch_size,
60
- )
61
- dataloader_test = _create_image_dataloader(
62
- self.eval_dataset,
63
- image_column_name=self.values_column_name,
64
- batch_size=batch_size,
65
- )
66
- elif self.task_metadata.modalities == ["text"]:
67
- if self.values_column_name != "text":
68
- self.train_dataset = self.train_dataset.rename_column(
69
- self.values_column_name, "text"
70
- )
71
- self.eval_dataset = self.eval_dataset.rename_column(
72
- self.values_column_name, "text"
73
- )
74
- dataloader_train = DataLoader(self.train_dataset)
75
- dataloader_test = DataLoader(self.eval_dataset)
76
- else:
77
- raise ValueError(
78
- "ClassificationEvaluator only supports image and text modalities."
79
- )
55
+ dataloader_train = create_dataloader(
56
+ self.train_dataset,
57
+ self.task_metadata,
58
+ input_column=self.values_column_name,
59
+ **encode_kwargs,
60
+ )
61
+ dataloader_test = create_dataloader(
62
+ self.eval_dataset,
63
+ self.task_metadata,
64
+ input_column=self.values_column_name,
65
+ **encode_kwargs,
66
+ )
80
67
  return dataloader_train, dataloader_test
81
68
 
82
69
  def __call__( # type: ignore[override]
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
98
85
 
99
86
  """
100
87
  dataloader_train, dataloader_test = self.create_dataloaders(
101
- batch_size=encode_kwargs["batch_size"]
88
+ encode_kwargs=encode_kwargs,
102
89
  )
103
90
 
104
91
  logger.info("Running - Encoding samples...")
@@ -46,7 +46,10 @@ class BitextMiningEvaluator(Evaluator):
46
46
 
47
47
  embeddings = {}
48
48
  for sub in tqdm(subsets):
49
- dataloader = _create_dataloader_from_texts(self.sentences[sub])
49
+ dataloader = _create_dataloader_from_texts(
50
+ self.sentences[sub],
51
+ **encode_kwargs,
52
+ )
50
53
  embeddings[sub] = model.encode(
51
54
  dataloader,
52
55
  task_metadata=self.task_metadata,
@@ -109,7 +109,8 @@ class SummarizationEvaluator(Evaluator):
109
109
  summary
110
110
  for human_summaries in self.human_summaries
111
111
  for summary in human_summaries
112
- ]
112
+ ],
113
+ **encode_kwargs,
113
114
  ),
114
115
  task_metadata=self.task_metadata,
115
116
  hf_subset=self.hf_subset,
@@ -124,7 +125,8 @@ class SummarizationEvaluator(Evaluator):
124
125
  summary
125
126
  for machine_summaries in self.machine_summaries
126
127
  for summary in machine_summaries
127
- ]
128
+ ],
129
+ **encode_kwargs,
128
130
  ),
129
131
  task_metadata=self.task_metadata,
130
132
  hf_subset=self.hf_subset,
@@ -42,14 +42,14 @@ class ZeroShotClassificationEvaluator(Evaluator):
42
42
  ) -> Array:
43
43
  dataloader = create_dataloader(
44
44
  self.dataset,
45
- batch_size=encode_kwargs["batch_size"],
46
45
  input_column=self.input_column_name,
47
46
  task_metadata=self.task_metadata,
47
+ **encode_kwargs,
48
48
  )
49
49
 
50
50
  logger.info("Running zero-shot classification - Encoding labels...")
51
51
  text_label_embeddings = model.encode(
52
- _create_dataloader_from_texts(self.candidate_labels),
52
+ _create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
53
53
  task_metadata=self.task_metadata,
54
54
  hf_subset=self.hf_subset,
55
55
  hf_split=self.hf_split,
File without changes
@@ -0,0 +1,125 @@
1
+ """Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
2
+
3
+ import logging
4
+
5
+ import datasets
6
+ import pandas as pd
7
+ from datasets import Dataset, DatasetDict
8
+
9
+ from mteb import TaskMetadata
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def deduplicate(dataset: Dataset, input_column: str) -> Dataset:
15
+ """Remove duplicate texts, keeping the first occurrence."""
16
+ unique_texts = set()
17
+ indices_to_keep = []
18
+ for i, text in enumerate(dataset[input_column]):
19
+ text = text.strip()
20
+ if text not in unique_texts:
21
+ unique_texts.add(text)
22
+ indices_to_keep.append(i)
23
+
24
+ logger.info(
25
+ f"[deduplicate] removed={len(dataset) - len(indices_to_keep)}/{len(dataset)}"
26
+ )
27
+ return dataset.select(indices_to_keep)
28
+
29
+
30
+ def filter_empty(dataset: Dataset, input_column: str) -> Dataset:
31
+ """Filter out empty or whitespace-only examples."""
32
+ before = len(dataset)
33
+ ds = dataset.filter(lambda x: len(x[input_column].strip()) > 0)
34
+ logger.info(f"[filter_empty] removed={before - len(ds)}/{before}")
35
+ return ds
36
+
37
+
38
+ def filter_train_leakage(
39
+ train_dataset: Dataset, test_dataset: Dataset, input_column: str
40
+ ) -> Dataset:
41
+ """Remove test examples that appear in training."""
42
+ train_texts = set(train_dataset[input_column])
43
+ before = len(test_dataset)
44
+ indices = [
45
+ i
46
+ for i, text in enumerate(test_dataset[input_column])
47
+ if text not in train_texts
48
+ ]
49
+ logger.info(f"[filter_train_leakage] removed={before - len(indices)}/{before}")
50
+ return test_dataset.select(indices)
51
+
52
+
53
+ def filter_unclear_label(
54
+ dataset_dict: DatasetDict, input_column: str, label_column: str
55
+ ) -> DatasetDict:
56
+ """Remove examples where the same text appears with multiple different labels."""
57
+ normalized: dict[str, set[str | tuple[str, ...]]] = {}
58
+ logger.debug("[filter_controversial] scanning dataset for label conflicts...")
59
+
60
+ for split, ds in dataset_dict.items():
61
+ for text, label in zip(ds[input_column], ds[label_column]):
62
+ key = text.strip().lower()
63
+ normalized.setdefault(key, set()).add(
64
+ label if isinstance(label, (str, int, float)) else tuple(label)
65
+ )
66
+
67
+ bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
68
+ logger.info(f"[filter_controversial] Removing {len(bad_texts)} conflicting texts")
69
+
70
+ new_dict = {}
71
+ for split, ds in dataset_dict.items():
72
+ before = len(ds)
73
+ filtered = ds.filter(lambda x: x[input_column].strip().lower() not in bad_texts)
74
+ logger.debug(
75
+ f"[filter_controversial:{split}] removed={before - len(filtered)}/{before}"
76
+ )
77
+ new_dict[split] = filtered
78
+
79
+ return DatasetDict(new_dict)
80
+
81
+
82
+ def filter_short(dataset: Dataset, input_column: str, min_words: int = 3) -> Dataset:
83
+ """Filter out texts with fewer than `min_words`."""
84
+ before = len(dataset)
85
+ ds = dataset.filter(lambda x: len(x[input_column].strip().split()) >= min_words)
86
+ logger.debug(f"[filter_short] removed={before - len(ds)}/{before}")
87
+ return ds
88
+
89
+
90
+ def split_train_test(
91
+ ds: DatasetDict,
92
+ metadata: TaskMetadata,
93
+ train_split: str,
94
+ label_column: str,
95
+ ) -> DatasetDict:
96
+ if train_split in ds and metadata.eval_splits == train_split:
97
+ before = len(ds[train_split])
98
+ logger.info(
99
+ f"[split_train_test] eval_splits == train_split; performing split on {before} examples"
100
+ )
101
+ ds[train_split] = ds[train_split].cast_column(
102
+ label_column,
103
+ datasets.ClassLabel(names=list(set(ds[train_split][label_column]))),
104
+ )
105
+ label_counts = pd.Series(ds[train_split][label_column]).value_counts()
106
+ one_sample_labels = set(label_counts[label_counts == 1].index.tolist())
107
+
108
+ if one_sample_labels:
109
+ logger.info(
110
+ f"[split_train_test] Removing {len(one_sample_labels)} labels with only one instance"
111
+ )
112
+ ds[train_split] = ds[train_split].filter(
113
+ lambda x: x[label_column] not in one_sample_labels
114
+ )
115
+
116
+ splits = ds[train_split].train_test_split(
117
+ test_size=min(2048, before // 2), seed=42, stratify_by_column=label_column
118
+ )
119
+ ds = DatasetDict({train_split: splits[train_split], "test": splits["test"]})
120
+ metadata.eval_splits = ["test"]
121
+ logger.info(
122
+ f"[split_train_test] Train size={len(ds[train_split])}, Test size={len(ds['test'])}"
123
+ )
124
+
125
+ return ds