mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,15 @@
1
1
  import logging
2
- from typing import Any, Protocol
2
+ from typing import Any, Protocol, cast
3
3
 
4
4
  import numpy as np
5
5
  from datasets import Dataset
6
6
  from torch.utils.data import DataLoader
7
7
  from typing_extensions import Self
8
8
 
9
- from mteb._create_dataloaders import _create_image_dataloader
9
+ from mteb._create_dataloaders import create_dataloader
10
10
  from mteb.abstasks.task_metadata import TaskMetadata
11
11
  from mteb.models import EncoderProtocol
12
- from mteb.types import BatchedInput
12
+ from mteb.types import Array, BatchedInput, EncodeKwargs
13
13
 
14
14
  from .evaluator import Evaluator
15
15
 
@@ -17,11 +17,11 @@ logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
19
  class SklearnModelProtocol(Protocol):
20
- def fit(self, X: np.ndarray, y: np.ndarray | list[int]) -> None: ... # noqa: N803
21
- def predict(self, X: np.ndarray) -> np.ndarray: ... # noqa: N803
20
+ def fit(self, X: Array, y: np.ndarray | list[int]) -> None: ... # noqa: N803
21
+ def predict(self, X: Array) -> np.ndarray: ... # noqa: N803
22
22
  def get_params(self) -> dict[str, Any]: ...
23
- def set_params(self, **kwargs: dict[str, Any]) -> Self: ...
24
- def score(self, X: np.ndarray, y: np.ndarray | list[int]) -> float: ... # noqa: N803
23
+ def set_params(self, random_state: int, **kwargs: dict[str, Any]) -> Self: ...
24
+ def score(self, X: Array, y: np.ndarray | list[int]) -> float: ... # noqa: N803
25
25
 
26
26
 
27
27
  class SklearnEvaluator(Evaluator):
@@ -50,42 +50,29 @@ class SklearnEvaluator(Evaluator):
50
50
  self.evaluator_model = evaluator_model
51
51
 
52
52
  def create_dataloaders(
53
- self, batch_size: int
53
+ self, encode_kwargs: EncodeKwargs
54
54
  ) -> tuple[DataLoader[BatchedInput], DataLoader[BatchedInput]]:
55
- if self.task_metadata.modalities == ["image"]:
56
- dataloader_train = _create_image_dataloader(
57
- self.train_dataset,
58
- image_column_name=self.values_column_name,
59
- batch_size=batch_size,
60
- )
61
- dataloader_test = _create_image_dataloader(
62
- self.eval_dataset,
63
- image_column_name=self.values_column_name,
64
- batch_size=batch_size,
65
- )
66
- elif self.task_metadata.modalities == ["text"]:
67
- if self.values_column_name != "text":
68
- self.train_dataset = self.train_dataset.rename_column(
69
- self.values_column_name, "text"
70
- )
71
- self.eval_dataset = self.eval_dataset.rename_column(
72
- self.values_column_name, "text"
73
- )
74
- dataloader_train = DataLoader(self.train_dataset)
75
- dataloader_test = DataLoader(self.eval_dataset)
76
- else:
77
- raise ValueError(
78
- "ClassificationEvaluator only supports image and text modalities."
79
- )
55
+ dataloader_train = create_dataloader(
56
+ self.train_dataset,
57
+ self.task_metadata,
58
+ input_column=self.values_column_name,
59
+ **encode_kwargs,
60
+ )
61
+ dataloader_test = create_dataloader(
62
+ self.eval_dataset,
63
+ self.task_metadata,
64
+ input_column=self.values_column_name,
65
+ **encode_kwargs,
66
+ )
80
67
  return dataloader_train, dataloader_test
81
68
 
82
69
  def __call__( # type: ignore[override]
83
70
  self,
84
71
  model: EncoderProtocol,
85
72
  *,
86
- encode_kwargs: dict[str, Any],
87
- test_cache: np.ndarray | None = None,
88
- ) -> tuple[np.ndarray, np.ndarray]:
73
+ encode_kwargs: EncodeKwargs,
74
+ test_cache: Array | None = None,
75
+ ) -> tuple[np.ndarray, Array]:
89
76
  """Classification evaluation by training a sklearn classifier on the embeddings of the training set and evaluating on the embeddings of the test set.
90
77
 
91
78
  Args:
@@ -98,7 +85,7 @@ class SklearnEvaluator(Evaluator):
98
85
 
99
86
  """
100
87
  dataloader_train, dataloader_test = self.create_dataloaders(
101
- batch_size=encode_kwargs["batch_size"]
88
+ encode_kwargs=encode_kwargs,
102
89
  )
103
90
 
104
91
  logger.info("Running - Encoding samples...")
@@ -117,6 +104,7 @@ class SklearnEvaluator(Evaluator):
117
104
  hf_subset=self.hf_subset,
118
105
  **encode_kwargs,
119
106
  )
107
+ test_cache = cast(Array, test_cache)
120
108
 
121
109
  logger.info("Running - Fitting classifier...")
122
110
  y_train = self.train_dataset[self.label_column_name]
@@ -1,7 +1,5 @@
1
1
  import logging
2
- from typing import Any
3
2
 
4
- import numpy as np
5
3
  import torch
6
4
  from datasets import Dataset
7
5
  from tqdm.auto import tqdm
@@ -10,6 +8,7 @@ from mteb._create_dataloaders import _create_dataloader_from_texts
10
8
  from mteb._evaluators.evaluator import Evaluator
11
9
  from mteb.abstasks.task_metadata import TaskMetadata
12
10
  from mteb.models import EncoderProtocol
11
+ from mteb.types import Array, EncodeKwargs
13
12
 
14
13
  logger = logging.getLogger(__name__)
15
14
 
@@ -33,7 +32,10 @@ class BitextMiningEvaluator(Evaluator):
33
32
  self.task_metadata = task_metadata
34
33
 
35
34
  def __call__(
36
- self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
35
+ self,
36
+ model: EncoderProtocol,
37
+ *,
38
+ encode_kwargs: EncodeKwargs,
37
39
  ) -> dict[str, list[dict[str, float]]]:
38
40
  pair_elements = {p for pair in self.pairs for p in pair}
39
41
  if isinstance(self.sentences, Dataset):
@@ -46,7 +48,10 @@ class BitextMiningEvaluator(Evaluator):
46
48
 
47
49
  embeddings = {}
48
50
  for sub in tqdm(subsets):
49
- dataloader = _create_dataloader_from_texts(self.sentences[sub])
51
+ dataloader = _create_dataloader_from_texts(
52
+ self.sentences[sub],
53
+ **encode_kwargs,
54
+ )
50
55
  embeddings[sub] = model.encode(
51
56
  dataloader,
52
57
  task_metadata=self.task_metadata,
@@ -66,11 +71,11 @@ class BitextMiningEvaluator(Evaluator):
66
71
 
67
72
  def _similarity_search(
68
73
  self,
69
- query_embeddings: np.ndarray,
70
- corpus_embeddings: np.ndarray,
74
+ query_embeddings: Array,
75
+ corpus_embeddings: Array,
71
76
  model: EncoderProtocol,
72
77
  query_chunk_size: int = 100,
73
- corpus_chunk_size: int = 500000,
78
+ corpus_chunk_size: int = 500_000,
74
79
  ) -> list[dict[str, float]]:
75
80
  """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
76
81
 
@@ -101,13 +106,15 @@ class BitextMiningEvaluator(Evaluator):
101
106
  ):
102
107
  query_embeddings = query_embeddings.to(corpus_embeddings.device)
103
108
 
104
- queries_result_list = [[] for _ in range(len(query_embeddings))]
109
+ queries_result_list: list[list[dict[str, float]]] = [
110
+ [] for _ in range(len(query_embeddings))
111
+ ]
105
112
 
106
113
  for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
107
114
  # Iterate over chunks of the corpus
108
115
  for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
109
116
  # Compute cosine similarities
110
- similarity_scores = model.similarity( # type: ignore
117
+ similarity_scores = model.similarity(
111
118
  query_embeddings[
112
119
  query_start_idx : query_start_idx + query_chunk_size
113
120
  ],
@@ -117,15 +124,17 @@ class BitextMiningEvaluator(Evaluator):
117
124
  )
118
125
 
119
126
  # Get top-k scores
120
- cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
121
- torch.tensor(similarity_scores),
122
- 1,
123
- dim=1,
124
- largest=True,
125
- sorted=False,
127
+ cos_scores_top_k_values_tensor, cos_scores_top_k_idx_tensor = (
128
+ torch.topk(
129
+ torch.tensor(similarity_scores),
130
+ 1,
131
+ dim=1,
132
+ largest=True,
133
+ sorted=False,
134
+ )
126
135
  )
127
- cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
128
- cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
136
+ cos_scores_top_k_values = cos_scores_top_k_values_tensor.cpu().tolist()
137
+ cos_scores_top_k_idx = cos_scores_top_k_idx_tensor.cpu().tolist()
129
138
 
130
139
  for query_itr in range(len(similarity_scores)):
131
140
  for sub_corpus_id, score in zip(
@@ -138,11 +147,14 @@ class BitextMiningEvaluator(Evaluator):
138
147
  {"corpus_id": corpus_id, "score": score}
139
148
  )
140
149
 
150
+ result_queries_list: list[dict[str, float]] = [
151
+ {} for _ in range(len(query_embeddings))
152
+ ]
141
153
  # Sort and strip to top_k results
142
154
  for idx in range(len(queries_result_list)):
143
155
  queries_result_list[idx] = sorted(
144
156
  queries_result_list[idx], key=lambda x: x["score"], reverse=True
145
157
  )
146
- queries_result_list[idx] = queries_result_list[idx][0]
158
+ result_queries_list[idx] = queries_result_list[idx][0]
147
159
 
148
- return queries_result_list
160
+ return result_queries_list
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import sys
3
- from typing import Any, TypedDict
3
+ from typing import TypedDict
4
4
 
5
5
  import numpy as np
6
6
  import torch
@@ -12,6 +12,7 @@ from mteb._evaluators.evaluator import Evaluator
12
12
  from mteb.abstasks.task_metadata import TaskMetadata
13
13
  from mteb.models import EncoderProtocol
14
14
  from mteb.similarity_functions import cos_sim, dot_score
15
+ from mteb.types import EncodeKwargs
15
16
 
16
17
  # if later than python 3.13 use typing module
17
18
  if sys.version_info >= (3, 13):
@@ -94,7 +95,7 @@ class SummarizationEvaluator(Evaluator):
94
95
  self,
95
96
  model: EncoderProtocol,
96
97
  *,
97
- encode_kwargs: dict[str, Any],
98
+ encode_kwargs: EncodeKwargs,
98
99
  ) -> SummarizationDistances:
99
100
  # Get the human & machine summaries for the text in one go for all
100
101
  human_lens = [len(human_summaries) for human_summaries in self.human_summaries]
@@ -109,7 +110,8 @@ class SummarizationEvaluator(Evaluator):
109
110
  summary
110
111
  for human_summaries in self.human_summaries
111
112
  for summary in human_summaries
112
- ]
113
+ ],
114
+ **encode_kwargs,
113
115
  ),
114
116
  task_metadata=self.task_metadata,
115
117
  hf_subset=self.hf_subset,
@@ -124,7 +126,8 @@ class SummarizationEvaluator(Evaluator):
124
126
  summary
125
127
  for machine_summaries in self.machine_summaries
126
128
  for summary in machine_summaries
127
- ]
129
+ ],
130
+ **encode_kwargs,
128
131
  ),
129
132
  task_metadata=self.task_metadata,
130
133
  hf_subset=self.hf_subset,
@@ -133,10 +136,10 @@ class SummarizationEvaluator(Evaluator):
133
136
  )
134
137
 
135
138
  # Split the embeddings into the original human & machine summaries
136
- embs_human_summaries_all = np.split(
139
+ embs_human_summaries_all_split = np.split(
137
140
  embs_human_summaries_all, np.cumsum(human_lens)[:-1]
138
141
  )
139
- embs_machine_summaries_all = np.split(
142
+ embs_machine_summaries_all_split = np.split(
140
143
  embs_machine_summaries_all, np.cumsum(machine_lens)[:-1]
141
144
  )
142
145
 
@@ -146,7 +149,9 @@ class SummarizationEvaluator(Evaluator):
146
149
  all_human_scores = []
147
150
 
148
151
  for i, (embs_human_summaries, embs_machine_summaries) in tqdm(
149
- enumerate(zip(embs_human_summaries_all, embs_machine_summaries_all)),
152
+ enumerate(
153
+ zip(embs_human_summaries_all_split, embs_machine_summaries_all_split)
154
+ ),
150
155
  desc="Scoring",
151
156
  total=len(self.human_summaries),
152
157
  ):
@@ -162,7 +167,7 @@ class SummarizationEvaluator(Evaluator):
162
167
  dot_scores = dot_score(emb_machine_summary, embs_human_summaries)
163
168
 
164
169
  _sim_score = [
165
- float(model.similarity(emb_machine_summary, emb_human_summary)) # type: ignore
170
+ float(model.similarity(emb_machine_summary, emb_human_summary))
166
171
  for emb_human_summary in embs_human_summaries
167
172
  ]
168
173
  sim_score = torch.tensor(_sim_score)
@@ -214,17 +219,19 @@ class SummarizationEvaluator(Evaluator):
214
219
  strict=True,
215
220
  ):
216
221
  cosine_spearman_scores.append(
217
- spearmanr(human_scores, cosine_pred_scores).statistic
222
+ float(spearmanr(human_scores, cosine_pred_scores).statistic)
218
223
  )
219
224
  cosine_pearson_scores.append(
220
- pearsonr(human_scores, cosine_pred_scores).statistic
225
+ float(pearsonr(human_scores, cosine_pred_scores).statistic)
221
226
  )
222
227
  dot_spearman_scores.append(
223
- spearmanr(human_scores, dot_pred_scores).statistic
228
+ float(spearmanr(human_scores, dot_pred_scores).statistic)
224
229
  )
225
- dot_pearson_scores.append(pearsonr(human_scores, dot_pred_scores).statistic)
226
- spearman_scores.append(spearmanr(human_scores, sim_scores).statistic)
227
- pearson_scores.append(pearsonr(human_scores, sim_scores).statistic)
230
+ dot_pearson_scores.append(
231
+ float(pearsonr(human_scores, dot_pred_scores).statistic)
232
+ )
233
+ spearman_scores.append(float(spearmanr(human_scores, sim_scores).statistic))
234
+ pearson_scores.append(float(pearsonr(human_scores, sim_scores).statistic))
228
235
 
229
236
  return SummarizationMetrics(
230
237
  pearson=float(np.mean(pearson_scores)),
@@ -271,10 +278,10 @@ class DeprecatedSummarizationEvaluator(SummarizationEvaluator):
271
278
  pearson_scores.append(pearsonr(human_scores, sim_scores))
272
279
 
273
280
  return SummarizationMetrics(
274
- pearson=float(np.mean(pearson_scores)),
275
- spearman=float(np.mean(spearman_scores)),
276
- cosine_spearman=float(np.mean(cosine_spearman_scores)),
277
- cosine_pearson=float(np.mean(cosine_pearson_scores)),
278
- dot_pearson=float(np.mean(dot_pearson_scores)),
279
- dot_spearman=float(np.mean(dot_spearman_scores)),
281
+ pearson=float(np.mean(pearson_scores)), # type: ignore[arg-type]
282
+ spearman=float(np.mean(spearman_scores)), # type: ignore[arg-type]
283
+ cosine_spearman=float(np.mean(cosine_spearman_scores)), # type: ignore[arg-type]
284
+ cosine_pearson=float(np.mean(cosine_pearson_scores)), # type: ignore[arg-type]
285
+ dot_pearson=float(np.mean(dot_pearson_scores)), # type: ignore[arg-type]
286
+ dot_spearman=float(np.mean(dot_spearman_scores)), # type: ignore[arg-type]
280
287
  )
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Any
3
2
 
4
3
  from datasets import Dataset
5
4
 
@@ -10,7 +9,7 @@ from mteb._create_dataloaders import (
10
9
  from mteb.abstasks.task_metadata import TaskMetadata
11
10
  from mteb.models import EncoderProtocol
12
11
  from mteb.similarity_functions import similarity
13
- from mteb.types import Array
12
+ from mteb.types import Array, EncodeKwargs
14
13
 
15
14
  from .evaluator import Evaluator
16
15
 
@@ -38,18 +37,21 @@ class ZeroShotClassificationEvaluator(Evaluator):
38
37
  self.hf_subset = hf_subset
39
38
 
40
39
  def __call__(
41
- self, model: EncoderProtocol, *, encode_kwargs: dict[str, Any]
40
+ self,
41
+ model: EncoderProtocol,
42
+ *,
43
+ encode_kwargs: EncodeKwargs,
42
44
  ) -> Array:
43
45
  dataloader = create_dataloader(
44
46
  self.dataset,
45
- batch_size=encode_kwargs["batch_size"],
46
47
  input_column=self.input_column_name,
47
48
  task_metadata=self.task_metadata,
49
+ **encode_kwargs,
48
50
  )
49
51
 
50
52
  logger.info("Running zero-shot classification - Encoding labels...")
51
53
  text_label_embeddings = model.encode(
52
- _create_dataloader_from_texts(self.candidate_labels),
54
+ _create_dataloader_from_texts(self.candidate_labels, **encode_kwargs),
53
55
  task_metadata=self.task_metadata,
54
56
  hf_subset=self.hf_subset,
55
57
  hf_split=self.hf_split,
File without changes
@@ -0,0 +1,125 @@
1
+ """Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
2
+
3
+ import logging
4
+
5
+ import datasets
6
+ import pandas as pd
7
+ from datasets import Dataset, DatasetDict
8
+
9
+ from mteb import TaskMetadata
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def deduplicate(dataset: Dataset, input_column: str) -> Dataset:
    """Drop rows whose text repeats an earlier row, keeping the first occurrence.

    Texts are compared after stripping leading and trailing whitespace; the
    stored values themselves are left untouched.

    Args:
        dataset: Dataset to deduplicate.
        input_column: Name of the column holding the text.

    Returns:
        A selection of `dataset` containing only first occurrences.
    """
    seen: set[str] = set()
    kept_positions: list[int] = []

    for position, raw_text in enumerate(dataset[input_column]):
        normalized = raw_text.strip()
        if normalized in seen:
            continue
        seen.add(normalized)
        kept_positions.append(position)

    total = len(dataset)
    logger.info(f"[deduplicate] removed={total - len(kept_positions)}/{total}")
    return dataset.select(kept_positions)
28
+
29
+
30
def filter_empty(dataset: Dataset, input_column: str) -> Dataset:
    """Drop rows whose text is empty once surrounding whitespace is stripped.

    Args:
        dataset: Dataset to filter.
        input_column: Name of the column holding the text.

    Returns:
        The filtered dataset.
    """
    original_size = len(dataset)
    non_empty = dataset.filter(lambda row: bool(row[input_column].strip()))
    removed = original_size - len(non_empty)
    logger.info(f"[filter_empty] removed={removed}/{original_size}")
    return non_empty
36
+
37
+
38
def filter_train_leakage(
    train_dataset: Dataset, test_dataset: Dataset, input_column: str
) -> Dataset:
    """Drop test rows whose text also occurs in the training data.

    The comparison is an exact match on the stored value; no whitespace or
    case normalization is applied.

    Args:
        train_dataset: Dataset providing the texts to exclude.
        test_dataset: Dataset to filter.
        input_column: Name of the column holding the text in both datasets.

    Returns:
        A selection of `test_dataset` without the overlapping rows.
    """
    seen_in_train = set(train_dataset[input_column])
    total = len(test_dataset)

    clean_positions = []
    for position, text in enumerate(test_dataset[input_column]):
        if text in seen_in_train:
            continue
        clean_positions.append(position)

    removed = total - len(clean_positions)
    logger.info(f"[filter_train_leakage] removed={removed}/{total}")
    return test_dataset.select(clean_positions)
51
+
52
+
53
def filter_unclear_label(
    dataset_dict: DatasetDict, input_column: str, label_column: str
) -> DatasetDict:
    """Remove examples whose text appears with more than one distinct label.

    Texts are compared after stripping whitespace and lower-casing, and the
    conflict scan spans every split of `dataset_dict`, so a text labelled
    differently in two splits is removed from both.

    Args:
        dataset_dict: Splits to scan and filter.
        input_column: Name of the column holding the text.
        label_column: Name of the column holding the label. Labels that are not
            `str`, `int` or `float` are converted to tuples so they can be
            stored in a set.

    Returns:
        A new `DatasetDict` with the same split names and the conflicting
        texts removed.
    """

    def _normalize(text: str) -> str:
        return text.strip().lower()

    labels_by_text: dict[str, set[str | tuple[str, ...]]] = {}
    logger.debug("[filter_unclear_label] scanning dataset for label conflicts...")

    # Only the datasets are needed for the scan, not the split names.
    for ds in dataset_dict.values():
        for text, label in zip(ds[input_column], ds[label_column]):
            hashable_label = (
                label if isinstance(label, (str, int, float)) else tuple(label)
            )
            labels_by_text.setdefault(_normalize(text), set()).add(
                hashable_label  # type: ignore[arg-type]
            )

    bad_texts = {text for text, labels in labels_by_text.items() if len(labels) > 1}
    logger.info(f"[filter_unclear_label] Removing {len(bad_texts)} conflicting texts")

    cleaned = {}
    for split, ds in dataset_dict.items():
        before = len(ds)
        filtered = ds.filter(lambda x: _normalize(x[input_column]) not in bad_texts)
        logger.debug(
            f"[filter_unclear_label:{split}] removed={before - len(filtered)}/{before}"
        )
        cleaned[split] = filtered

    return DatasetDict(cleaned)
80
+
81
+
82
def filter_short(dataset: Dataset, input_column: str, min_words: int = 3) -> Dataset:
    """Drop rows whose text has fewer than `min_words` whitespace-separated words.

    Args:
        dataset: Dataset to filter.
        input_column: Name of the column holding the text.
        min_words: Minimum number of words a text needs to be kept.

    Returns:
        The filtered dataset.
    """
    size_before = len(dataset)

    def _long_enough(row) -> bool:
        words = row[input_column].strip().split()
        return len(words) >= min_words

    kept = dataset.filter(_long_enough)
    logger.debug(f"[filter_short] removed={size_before - len(kept)}/{size_before}")
    return kept
88
+
89
+
90
def split_train_test(
    ds: DatasetDict,
    metadata: TaskMetadata,
    train_split: str,
    label_column: str,
) -> DatasetDict:
    """Carve a held-out test split out of the training split when the task evaluates on it.

    When `train_split` is present in `ds` and is the task's only evaluation
    split, the training data is split (stratified by label, seed 42) into a new
    training split and a `"test"` split of at most 2048 rows, and
    `metadata.eval_splits` is redirected to `["test"]`. Labels that occur
    exactly once are removed first. In every other case `ds` is returned
    unchanged.

    Args:
        ds: Splits of the task dataset.
        metadata: Task metadata; its `eval_splits` is updated in place when a
            split is performed.
        train_split: Name of the training split.
        label_column: Name of the label column, used for stratification.

    Returns:
        `ds` unchanged, or a new `DatasetDict` holding only `train_split` and
        `"test"`.
    """
    eval_splits = metadata.eval_splits
    # `eval_splits` is a collection of split names (it is iterated by the
    # pipeline and assigned a list below), so comparing it to the bare string
    # never matched. The string comparison is kept for backward compatibility.
    evaluates_on_train = (
        eval_splits == train_split or list(eval_splits) == [train_split]
    )
    if train_split not in ds or not evaluates_on_train:
        return ds

    train = ds[train_split]
    before = len(train)
    logger.info(
        f"[split_train_test] eval_splits == train_split; performing split on {before} examples"
    )

    # Sort the label names so class ids do not depend on set iteration order,
    # which varies between processes for strings and would make the seeded
    # stratified split non-reproducible.
    label_names = sorted(set(train[label_column]))
    train = train.cast_column(label_column, datasets.ClassLabel(names=label_names))

    label_counts = pd.Series(train[label_column]).value_counts()
    one_sample_labels = set(label_counts[label_counts == 1].index.tolist())

    if one_sample_labels:
        logger.info(
            f"[split_train_test] Removing {len(one_sample_labels)} labels with only one instance"
        )
        train = train.filter(lambda x: x[label_column] not in one_sample_labels)

    # Size the test split from the rows that are actually left; using the
    # pre-filter count could request more rows than remain.
    remaining = len(train)
    splits = train.train_test_split(
        test_size=min(2048, remaining // 2), seed=42, stratify_by_column=label_column
    )
    # `train_test_split` always names its outputs "train" and "test",
    # regardless of what the task calls its training split.
    ds = DatasetDict({train_split: splits["train"], "test": splits["test"]})

    # NOTE(review): this mutates the shared metadata object. A later call with
    # the same metadata (e.g. for another subset) will no longer see
    # `eval_splits == [train_split]` and will not be split - confirm whether
    # callers need per-subset handling.
    metadata.eval_splits = ["test"]
    logger.info(
        f"[split_train_test] Train size={len(ds[train_split])}, Test size={len(ds['test'])}"
    )

    return ds
@@ -0,0 +1,105 @@
1
+ import logging
2
+
3
+ from datasets import DatasetDict
4
+
5
+ from mteb import TaskMetadata
6
+ from mteb.abstasks import AbsTaskClassification
7
+ from mteb.abstasks._data_filter.filters import (
8
+ deduplicate,
9
+ filter_empty,
10
+ filter_short,
11
+ filter_train_leakage,
12
+ filter_unclear_label,
13
+ split_train_test,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def clean_dataset(
    ds: DatasetDict,
    metadata: TaskMetadata,
    train_split: str,
    input_column: str,
    label_column: str,
    subset: str | None = None,
) -> DatasetDict:
    """Apply the full cleaning pipeline with logging.

    Steps, in order:
      1. On the training split and every evaluation split: drop empty texts,
         drop duplicates and, unless one of the task languages is in the skip
         list, drop texts that are too short.
      2. Split the training data into train/test via `split_train_test`.
      3. Drop evaluation examples whose text also occurs in the training split.
      4. Drop texts that appear with conflicting labels.

    Splits named in the metadata but absent from `ds` are skipped with a
    warning. The splits of `ds` are replaced in place while filtering.

    Args:
        ds: Splits of the dataset to clean.
        metadata: Task metadata supplying `eval_splits` and `eval_langs`.
        train_split: Name of the training split.
        input_column: Name of the column holding the text.
        label_column: Name of the column holding the label.
        subset: Subset name used to look up languages when
            `metadata.eval_langs` is a dict.

    Returns:
        The cleaned `DatasetDict`.
    """
    logger.info("[clean_dataset] Starting dataset cleaning pipeline...")

    transforms = [
        ("filter_empty", filter_empty),
        ("deduplicate", deduplicate),
    ]

    # Languages for which the word-count based short-text filter is not applied.
    skip_cjk_codes = {"zho", "jpn", "tha", "mya", "cmn"}
    cur_langs = (
        metadata.eval_langs[subset]
        if isinstance(metadata.eval_langs, dict) and subset
        else metadata.eval_langs
    )
    apply_short = not any(lang.split("-")[0] in skip_cjk_codes for lang in cur_langs)
    if apply_short:
        logger.info("[clean_dataset] Applying short-text filter")
        transforms.append(("filter_short", filter_short))
    else:
        logger.info("[clean_dataset] Skipping short-text filter")

    # De-duplicate split names (keeping order) so the training split is not
    # processed twice when it is also an evaluation split.
    for split in dict.fromkeys([train_split, *metadata.eval_splits]):
        if split not in ds:
            logger.warning(f"[clean_dataset] Split '{split}' missing; skipping.")
            continue

        for name, fn in transforms:
            before = len(ds[split])
            ds[split] = fn(ds[split], input_column=input_column)
            logger.info(
                f"[clean_dataset:{split}] {name} removed={before - len(ds[split])}"
            )

    ds = split_train_test(ds, metadata, train_split, label_column)

    for split in metadata.eval_splits:
        if split == train_split:
            continue
        if split not in ds:
            # Same policy as the filtering loop above: a missing split is
            # reported and skipped rather than raising KeyError.
            logger.warning(f"[clean_dataset] Split '{split}' missing; skipping.")
            continue
        before = len(ds[split])
        ds[split] = filter_train_leakage(ds[train_split], ds[split], input_column)
        logger.info(
            f"[clean_dataset:{split}] leakage_removed={before - len(ds[split])}"
        )

    ds = filter_unclear_label(ds, input_column=input_column, label_column=label_column)

    logger.info("[clean_dataset] Cleaning pipeline complete.")
    return ds
74
+
75
+
76
def process_classification(
    task: AbsTaskClassification,
) -> DatasetDict | dict[str, DatasetDict]:
    """Run the cleaning pipeline over a classification task's dataset(s).

    The task's data is loaded first if it has not been loaded yet.

    Args:
        task: Classification task whose dataset should be cleaned.

    Returns:
        The cleaned `DatasetDict` when the task holds a single `DatasetDict`,
        otherwise a dict mapping each subset name to its cleaned `DatasetDict`.

    Raises:
        ValueError: If the task's dataset is `None`.
    """
    if not task.data_loaded:
        task.load_data()

    def _clean(data: DatasetDict, subset_name: str | None) -> DatasetDict:
        return clean_dataset(
            data,
            task.metadata,
            task.train_split,
            task.input_column_name,
            task.label_column_name,
            subset=subset_name,
        )

    if isinstance(task.dataset, DatasetDict):
        return _clean(task.dataset, None)

    if task.dataset is None:
        raise ValueError("Task dataset is None.")

    return {name: _clean(task.dataset[name], name) for name in task.dataset}