mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import logging
3
3
  import random
4
4
  from collections import defaultdict
5
5
  from pathlib import Path
6
- from typing import Any
6
+ from typing import Any, cast
7
7
 
8
8
  import numpy as np
9
9
  from datasets import Dataset, DatasetDict
@@ -11,8 +11,8 @@ from sklearn.cluster import MiniBatchKMeans
11
11
  from sklearn.metrics.cluster import v_measure_score
12
12
 
13
13
  from mteb._create_dataloaders import create_dataloader
14
- from mteb.models import EncoderProtocol
15
- from mteb.types import HFSubset, ScoresDict
14
+ from mteb.models import EncoderProtocol, MTEBModels
15
+ from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
16
16
  from mteb.types.statistics import (
17
17
  ImageStatistics,
18
18
  LabelStatistics,
@@ -34,7 +34,7 @@ MultilingualDataset = dict[HFSubset, DatasetDict]
34
34
 
35
35
 
36
36
  def _evaluate_clustering_bootstrapped(
37
- embeddings: np.ndarray,
37
+ embeddings: Array,
38
38
  labels: list[list[str]],
39
39
  n_clusters: int,
40
40
  cluster_size: int,
@@ -61,21 +61,21 @@ def _evaluate_clustering_bootstrapped(
61
61
  max_depth = max(map(len, labels))
62
62
  # Evaluate on each level til max depth
63
63
  for i_level in range(max_depth):
64
- level_labels = []
64
+ level_labels: list[str | int] = []
65
65
  # Assign -1 to gold label if the level is not there
66
66
  for label in labels:
67
67
  if len(label) > i_level:
68
68
  level_labels.append(label[i_level])
69
69
  else:
70
70
  level_labels.append(-1)
71
- level_labels = np.array(level_labels)
71
+ np_level_labels = np.array(level_labels)
72
72
  valid_idx = np.array(
73
- [level_label != -1 for level_label in level_labels]
73
+ [level_label != -1 for level_label in np_level_labels]
74
74
  ) # Could be level_labels != -1 but fails with FutureWarning: elementwise comparison failed
75
- level_labels = level_labels[valid_idx]
75
+ np_level_labels = np_level_labels[valid_idx]
76
76
  level_embeddings = embeddings[valid_idx]
77
77
  clustering_model = MiniBatchKMeans(
78
- n_clusters=np.unique(level_labels).size,
78
+ n_clusters=np.unique(np_level_labels).size,
79
79
  batch_size=kmean_batch_size,
80
80
  init="k-means++",
81
81
  n_init=1, # default when kmeans++ is used
@@ -87,7 +87,7 @@ def _evaluate_clustering_bootstrapped(
87
87
  cluster_indices = rng_state.choices(range(n_embeddings), k=cluster_size)
88
88
 
89
89
  _embeddings = level_embeddings[cluster_indices]
90
- _labels = level_labels[cluster_indices]
90
+ _labels = np_level_labels[cluster_indices]
91
91
  cluster_assignment = clustering_model.fit_predict(_embeddings)
92
92
  v_measure = v_measure_score(_labels, cluster_assignment)
93
93
  v_measures[f"Level {i_level}"].append(v_measure)
@@ -153,15 +153,19 @@ class AbsTaskClustering(AbsTask):
153
153
 
154
154
  def _evaluate_subset(
155
155
  self,
156
- model: EncoderProtocol,
156
+ model: MTEBModels,
157
157
  data_split: Dataset,
158
158
  *,
159
- encode_kwargs: dict[str, Any],
159
+ encode_kwargs: EncodeKwargs,
160
160
  hf_split: str,
161
161
  hf_subset: str,
162
162
  prediction_folder: Path | None = None,
163
163
  **kwargs: Any,
164
164
  ) -> ScoresDict:
165
+ if not isinstance(model, EncoderProtocol):
166
+ raise TypeError(
167
+ "Expected encoder model to be an instance of EncoderProtocol."
168
+ )
165
169
  if (
166
170
  self.max_document_to_embed is not None
167
171
  and self.max_fraction_of_documents_to_embed is not None
@@ -182,13 +186,13 @@ class AbsTaskClustering(AbsTask):
182
186
  self.max_fraction_of_documents_to_embed * len(data_split)
183
187
  )
184
188
  else:
185
- max_documents_to_embed = self.max_document_to_embed
189
+ max_documents_to_embed = cast(int, self.max_document_to_embed)
186
190
 
187
- max_documents_to_embed = min(len(data_split), max_documents_to_embed) # type: ignore
191
+ max_documents_to_embed = min(len(data_split), max_documents_to_embed)
188
192
  example_indices = self.rng_state.sample(
189
193
  range(len(data_split)), k=max_documents_to_embed
190
194
  )
191
- downsampled_dataset = data_split.select(example_indices) # type: ignore
195
+ downsampled_dataset = data_split.select(example_indices)
192
196
 
193
197
  downsampled_dataset = downsampled_dataset.select_columns(
194
198
  [self.input_column_name, self.label_column_name]
@@ -200,7 +204,7 @@ class AbsTaskClustering(AbsTask):
200
204
  downsampled_dataset,
201
205
  self.metadata,
202
206
  input_column=self.input_column_name,
203
- batch_size=encode_kwargs["batch_size"],
207
+ **encode_kwargs,
204
208
  ),
205
209
  task_metadata=self.metadata,
206
210
  hf_subset=hf_subset,
@@ -8,8 +8,8 @@ from scipy.optimize import linear_sum_assignment
8
8
  from sklearn import metrics
9
9
 
10
10
  from mteb._evaluators import ClusteringEvaluator
11
- from mteb.models import EncoderProtocol
12
- from mteb.types import ScoresDict
11
+ from mteb.models import EncoderProtocol, MTEBModels
12
+ from mteb.types import EncodeKwargs, ScoresDict
13
13
  from mteb.types.statistics import (
14
14
  ImageStatistics,
15
15
  LabelStatistics,
@@ -80,15 +80,21 @@ class AbsTaskClusteringLegacy(AbsTask):
80
80
 
81
81
  def _evaluate_subset(
82
82
  self,
83
- model: EncoderProtocol,
83
+ model: MTEBModels,
84
84
  data_split: Dataset,
85
85
  *,
86
- encode_kwargs: dict[str, Any],
86
+ encode_kwargs: EncodeKwargs,
87
87
  hf_split: str,
88
88
  hf_subset: str,
89
89
  prediction_folder: Path | None = None,
90
90
  **kwargs: Any,
91
91
  ) -> ScoresDict:
92
+ if not isinstance(model, EncoderProtocol):
93
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
94
+
95
+ data_split = data_split.select_columns(
96
+ [self.input_column_name, self.label_column_name]
97
+ )
92
98
  # MTEB text clustering requires renaming and eval per subset.
93
99
  if self.metadata.modalities == ["text"]:
94
100
  all_metrics = []
@@ -136,9 +142,6 @@ class AbsTaskClusteringLegacy(AbsTask):
136
142
  }
137
143
  return scores
138
144
 
139
- data_split = data_split.select_columns(
140
- [self.input_column_name, self.label_column_name]
141
- )
142
145
  evaluator = self.evaluator(
143
146
  data_split,
144
147
  input_column_name=self.input_column_name,
@@ -148,10 +151,10 @@ class AbsTaskClusteringLegacy(AbsTask):
148
151
  hf_subset=hf_subset,
149
152
  **kwargs,
150
153
  )
151
- clusters = evaluator(model, encode_kwargs=encode_kwargs)
154
+ evaluate_clusters = evaluator(model, encode_kwargs=encode_kwargs)
152
155
  if prediction_folder:
153
156
  self._save_task_predictions(
154
- clusters,
157
+ evaluate_clusters,
155
158
  model,
156
159
  prediction_folder,
157
160
  hf_subset=hf_subset,
@@ -160,7 +163,7 @@ class AbsTaskClusteringLegacy(AbsTask):
160
163
 
161
164
  return self._compute_metrics(
162
165
  data_split[self.label_column_name],
163
- clusters,
166
+ evaluate_clusters,
164
167
  )
165
168
 
166
169
  def _compute_metrics(
@@ -12,7 +12,8 @@ from mteb.abstasks._statistics_calculation import (
12
12
  calculate_text_statistics,
13
13
  )
14
14
  from mteb.abstasks.abstask import AbsTask
15
- from mteb.models.models_protocols import EncoderProtocol
15
+ from mteb.models.models_protocols import EncoderProtocol, MTEBModels
16
+ from mteb.types import EncodeKwargs
16
17
  from mteb.types.statistics import (
17
18
  ImageStatistics,
18
19
  SplitDescriptiveStatistics,
@@ -116,15 +117,17 @@ class AbsTaskImageTextPairClassification(AbsTask):
116
117
 
117
118
  def _evaluate_subset(
118
119
  self,
119
- model: EncoderProtocol,
120
+ model: MTEBModels,
120
121
  data_split: Dataset,
121
122
  *,
122
- encode_kwargs: dict[str, Any],
123
+ encode_kwargs: EncodeKwargs,
123
124
  hf_split: str,
124
125
  hf_subset: str,
125
126
  prediction_folder: Path | None = None,
126
127
  **kwargs: Any,
127
128
  ) -> ImageTextPairClassificationMetrics:
129
+ if not isinstance(model, EncoderProtocol):
130
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
128
131
  select_columns = []
129
132
  for columns in (self.images_column_names, self.texts_column_names):
130
133
  if isinstance(columns, str):
@@ -154,7 +157,7 @@ class AbsTaskImageTextPairClassification(AbsTask):
154
157
  hf_subset=hf_subset,
155
158
  **kwargs,
156
159
  )
157
- scores = evaluator(model, encode_kwargs=encode_kwargs)
160
+ scores: list[torch.Tensor] = evaluator(model, encode_kwargs=encode_kwargs) # type: ignore[assignment]
158
161
  if prediction_folder:
159
162
  self._save_task_predictions(
160
163
  [score.tolist() for score in scores],
@@ -14,8 +14,10 @@ from sklearn.preprocessing import MultiLabelBinarizer
14
14
  from typing_extensions import override
15
15
 
16
16
  from mteb._create_dataloaders import create_dataloader
17
+ from mteb._evaluators.classification_metrics import hamming_score
17
18
  from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
18
- from mteb.models import EncoderProtocol
19
+ from mteb.models import EncoderProtocol, MTEBModels
20
+ from mteb.types import Array, EncodeKwargs
19
21
 
20
22
  from .classification import AbsTaskClassification
21
23
 
@@ -23,14 +25,14 @@ logger = logging.getLogger(__name__)
23
25
 
24
26
 
25
27
  def _evaluate_classifier(
26
- embeddings_train: np.ndarray,
28
+ embeddings_train: Array,
27
29
  y_train: np.ndarray,
28
- embeddings_test: np.ndarray,
30
+ embeddings_test: Array,
29
31
  classifier: SklearnModelProtocol,
30
32
  ) -> tuple[np.ndarray, SklearnModelProtocol]:
31
- classifier: SklearnModelProtocol = clone(classifier)
32
- classifier.fit(embeddings_train, y_train)
33
- return classifier.predict(embeddings_test), classifier
33
+ classifier_copy: SklearnModelProtocol = clone(classifier)
34
+ classifier_copy.fit(embeddings_train, y_train)
35
+ return classifier_copy.predict(embeddings_test), classifier_copy
34
36
 
35
37
 
36
38
  class MultilabelClassificationMetrics(TypedDict):
@@ -40,11 +42,13 @@ class MultilabelClassificationMetrics(TypedDict):
40
42
  accuracy: Accuracy of the classifier.
41
43
  lrap: Label Ranking Average Precision (LRAP) score.
42
44
  f1: Macro F1 score.
45
+ hamming: Hamming score (label-based accuracy).
43
46
  """
44
47
 
45
48
  accuracy: float
46
49
  lrap: float
47
50
  f1: float
51
+ hamming: float
48
52
 
49
53
 
50
54
  class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
@@ -66,25 +70,28 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
66
70
  input_column_name: Name of the column containing the input text.
67
71
  label_column_name: Name of the column containing the labels.
68
72
  samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
69
- evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
73
+ evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
70
74
  """
71
75
 
72
- evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
76
+ evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
73
77
  input_column_name: str = "text"
74
78
  label_column_name: str = "label"
75
79
 
76
80
  @override
77
- def _evaluate_subset(
81
+ def _evaluate_subset( # type: ignore[override]
78
82
  self,
79
- model: EncoderProtocol,
83
+ model: MTEBModels,
80
84
  data_split: DatasetDict,
81
85
  *,
82
- encode_kwargs: dict[str, Any],
86
+ encode_kwargs: EncodeKwargs,
83
87
  hf_split: str,
84
88
  hf_subset: str,
85
89
  prediction_folder: Path | None = None,
86
90
  **kwargs: Any,
87
91
  ) -> FullMultilabelClassificationMetrics:
92
+ if not isinstance(model, EncoderProtocol):
93
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
94
+
88
95
  if isinstance(data_split, DatasetDict):
89
96
  data_split = data_split.select_columns(
90
97
  [self.input_column_name, self.label_column_name]
@@ -112,7 +119,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
112
119
  unique_train_dataset,
113
120
  self.metadata,
114
121
  input_column=self.input_column_name,
115
- batch_size=encode_kwargs["batch_size"],
122
+ **encode_kwargs,
116
123
  )
117
124
 
118
125
  logger.info("Running multilabel classification - Encoding training set...")
@@ -141,7 +148,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
141
148
  test_dataset.select_columns(self.input_column_name),
142
149
  self.metadata,
143
150
  input_column=self.input_column_name,
144
- batch_size=encode_kwargs["batch_size"],
151
+ **encode_kwargs,
145
152
  )
146
153
 
147
154
  logger.info("Running multilabel classification - Encoding test set...")
@@ -157,12 +164,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
157
164
 
158
165
  logger.info("Running multilabel classification - Evaluating classifiers...")
159
166
  all_predictions = []
160
- for i_experiment, sample_indices in enumerate(train_samples):
167
+ for _, sample_indices in enumerate(train_samples):
161
168
  X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
162
169
  y_train = train_split.select(sample_indices)[self.label_column_name]
163
170
  y_train = binarizer.transform(y_train)
164
171
  y_pred, current_classifier = _evaluate_classifier(
165
- X_train, y_train, X_test, self.evaluator
172
+ X_train, y_train, X_test, self.evaluator_model
166
173
  )
167
174
  if prediction_folder:
168
175
  all_predictions.append(y_pred.tolist())
@@ -182,19 +189,20 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
182
189
  )
183
190
 
184
191
  avg_scores: dict[str, Any] = {
185
- k: np.mean([s[k] for s in scores]) for k in scores[0].keys()
192
+ k: np.mean([s[k] for s in scores]) # type: ignore[literal-required]
193
+ for k in scores[0].keys()
186
194
  }
187
195
  logger.info("Running multilabel classification - Finished.")
188
196
  return FullMultilabelClassificationMetrics(
189
197
  scores_per_experiment=scores,
190
- **avg_scores,
198
+ **avg_scores, # type: ignore[typeddict-item]
191
199
  )
192
200
 
193
- def _calculate_scores(
201
+ def _calculate_scores( # type: ignore[override]
194
202
  self,
195
203
  y_test: np.ndarray,
196
204
  y_pred: np.ndarray,
197
- x_test_embedding: np.ndarray,
205
+ x_test_embedding: Array,
198
206
  current_classifier: SklearnModelProtocol,
199
207
  ) -> MultilabelClassificationMetrics:
200
208
  accuracy = current_classifier.score(x_test_embedding, y_test)
@@ -207,10 +215,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
207
215
  else:
208
216
  lrap = label_ranking_average_precision_score(y_test, y_pred)
209
217
  f1 = f1_score(y_test, y_pred, average="macro")
218
+ hamming = hamming_score(y_test, y_pred)
210
219
  return MultilabelClassificationMetrics(
211
220
  accuracy=accuracy,
212
221
  lrap=lrap,
213
222
  f1=f1,
223
+ hamming=hamming,
214
224
  )
215
225
 
216
226
  def _undersample_data_indices(
@@ -218,6 +228,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
218
228
  ) -> tuple[list[int], list[int]]:
219
229
  """Undersample data to have samples_per_label samples of each label.
220
230
 
231
+ Currently ensures that each label has at least samples_per_label samples.
232
+
221
233
  Returns:
222
234
  A tuple containing:
223
235
  - List of sampled indices.
@@ -225,10 +237,9 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
225
237
  """
226
238
  sample_indices = []
227
239
  if idxs is None:
228
- idxs = np.arange(len(y))
240
+ idxs = list(np.arange(len(y)))
229
241
  self.np_rng.shuffle(idxs)
230
- idxs = idxs.tolist()
231
- label_counter = defaultdict(int)
242
+ label_counter: dict[int, int] = defaultdict(int)
232
243
  for i in idxs:
233
244
  if any((label_counter[label] < samples_per_label) for label in y[i]):
234
245
  sample_indices.append(i)
@@ -18,7 +18,8 @@ from mteb.abstasks._statistics_calculation import (
18
18
  )
19
19
  from mteb.abstasks.abstask import AbsTask
20
20
  from mteb.models.model_meta import ScoringFunction
21
- from mteb.models.models_protocols import EncoderProtocol
21
+ from mteb.models.models_protocols import EncoderProtocol, MTEBModels
22
+ from mteb.types import EncodeKwargs, PromptType
22
23
  from mteb.types.statistics import (
23
24
  ImageStatistics,
24
25
  LabelStatistics,
@@ -35,7 +36,7 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
35
36
  Attributes:
36
37
  num_samples: number of samples in the dataset.
37
38
  number_of_characters: Total number of symbols in the dataset.
38
- unique_text_pairs: Number of unique pairs
39
+ unique_pairs: Number of unique pairs
39
40
 
40
41
  text1_statistics: Statistics for sentence1
41
42
  text2_statistics: Statistics for sentence2
@@ -43,8 +44,8 @@ class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics):
43
44
  """
44
45
 
45
46
  num_samples: int
46
- number_of_characters: int
47
- unique_pairs: int
47
+ number_of_characters: int | None
48
+ unique_pairs: int | None
48
49
 
49
50
  text1_statistics: TextStatistics | None
50
51
  image1_statistics: ImageStatistics | None
@@ -65,24 +66,31 @@ class AbsTaskPairClassification(AbsTask):
65
66
  input2_column_name: The name of the column containing the second sentence in the pair.
66
67
  label_column_name: The name of the column containing the labels for the pairs. Labels should be 0 or 1.
67
68
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
69
+ input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
70
+ input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
68
71
  """
69
72
 
70
73
  abstask_prompt = "Retrieve text that are semantically similar to the given text."
71
74
  input1_column_name: str = "sentence1"
72
75
  input2_column_name: str = "sentence2"
73
76
  label_column_name: str = "labels"
77
+ input1_prompt_type: PromptType | None = None
78
+ input2_prompt_type: PromptType | None = None
74
79
 
75
80
  def _evaluate_subset(
76
81
  self,
77
- model: EncoderProtocol,
82
+ model: MTEBModels,
78
83
  data_split: Dataset,
79
84
  *,
80
85
  hf_split: str,
81
86
  hf_subset: str,
82
- encode_kwargs: dict[str, str],
87
+ encode_kwargs: EncodeKwargs,
83
88
  prediction_folder: Path | None = None,
84
89
  **kwargs,
85
90
  ) -> dict[str, float]:
91
+ if not isinstance(model, EncoderProtocol):
92
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
93
+
86
94
  if self.metadata.modalities == ["text"]:
87
95
  # for compatibility with v1 version where datasets were stored in a single row
88
96
  data_split = data_split[0] if len(data_split) == 1 else data_split
@@ -93,6 +101,8 @@ class AbsTaskPairClassification(AbsTask):
93
101
  task_metadata=self.metadata,
94
102
  hf_split=hf_split,
95
103
  hf_subset=hf_subset,
104
+ input1_prompt_type=self.input1_prompt_type,
105
+ input2_prompt_type=self.input2_prompt_type,
96
106
  **kwargs,
97
107
  )
98
108
  similarity_scores = evaluator(model, encode_kwargs=encode_kwargs)
@@ -113,7 +123,7 @@ class AbsTaskPairClassification(AbsTask):
113
123
  self, similarity_scores: PairClassificationDistances, labels: list[int]
114
124
  ) -> dict[str, float]:
115
125
  logger.info("Computing metrics...")
116
- labels = np.asarray(labels)
126
+ np_labels = np.asarray(labels)
117
127
  output_scores = {}
118
128
  max_scores = defaultdict(list)
119
129
  for short_name, scores, reverse in [
@@ -135,7 +145,7 @@ class AbsTaskPairClassification(AbsTask):
135
145
  ],
136
146
  [ScoringFunction.DOT_PRODUCT.value, similarity_scores["dot_scores"], True],
137
147
  ]:
138
- metrics = self._compute_metrics_values(scores, labels, reverse)
148
+ metrics = self._compute_metrics_values(scores, np_labels, reverse) # type: ignore[arg-type]
139
149
  for metric_name, metric_value in metrics.items():
140
150
  output_scores[f"{short_name}_{metric_name}"] = metric_value
141
151
  max_scores[metric_name].append(metric_value)
@@ -230,6 +240,12 @@ class AbsTaskPairClassification(AbsTask):
230
240
 
231
241
  def _push_dataset_to_hub(self, repo_name: str) -> None:
232
242
  # previously pair classification datasets were stored in a single row
243
+ if self.dataset is None:
244
+ # overall this shouldn't happen as we check for dataset before pushing to hub
245
+ # added here for type checking purposes
246
+ raise RuntimeError(
247
+ "Dataset not loaded. To load dataset run `task.load_data()`."
248
+ )
233
249
  if self.metadata.is_multilingual:
234
250
  for subset in self.dataset:
235
251
  for split in self.dataset[subset]:
@@ -283,13 +299,13 @@ class AbsTaskPairClassification(AbsTask):
283
299
  )
284
300
 
285
301
  def _find_best_acc_and_threshold(
286
- self, scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
302
+ self, scores: list[float], labels: np.ndarray, high_score_more_similar: bool
287
303
  ) -> tuple[float, float]:
288
304
  rows = list(zip(scores, labels))
289
305
  rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
290
306
 
291
307
  max_acc = 0
292
- best_threshold = -1
308
+ best_threshold = -1.0
293
309
  positive_so_far = 0
294
310
  remaining_negatives = sum(np.array(labels) == 0)
295
311
 
@@ -316,7 +332,7 @@ class AbsTaskPairClassification(AbsTask):
316
332
 
317
333
  rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
318
334
 
319
- best_f1 = best_precision = best_recall = 0
335
+ best_f1 = best_precision = best_recall = 0.0
320
336
  threshold = 0
321
337
  nextract = 0
322
338
  ncorrect = 0
@@ -84,10 +84,10 @@ class AbsTaskRegression(AbsTaskClassification):
84
84
  n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
85
85
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
86
86
  evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
87
- Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
87
+
88
88
  """
89
89
 
90
- evaluator: type[SklearnModelProtocol] = SklearnEvaluator
90
+ evaluator: type[SklearnEvaluator] = SklearnEvaluator
91
91
  evaluator_model: SklearnModelProtocol = LinearRegression(n_jobs=-1)
92
92
 
93
93
  train_split: str = "train"
@@ -113,7 +113,7 @@ class AbsTaskRegression(AbsTaskClassification):
113
113
  )["train"]
114
114
  return train_split_sampled, []
115
115
 
116
- def _calculate_scores(
116
+ def _calculate_scores( # type: ignore[override]
117
117
  self,
118
118
  y_test: np.ndarray | list[int],
119
119
  y_pred: np.ndarray,
@@ -183,7 +183,7 @@ class AbsTaskRegression(AbsTaskClassification):
183
183
 
184
184
  return dataset_dict
185
185
 
186
- def _calculate_descriptive_statistics_from_split(
186
+ def _calculate_descriptive_statistics_from_split( # type: ignore[override]
187
187
  self, split: str, hf_subset: str | None = None, compute_overall: bool = False
188
188
  ) -> RegressionDescriptiveStatistics:
189
189
  train_text = []