mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  from collections import defaultdict
4
- from collections.abc import Callable, Sequence
4
+ from collections.abc import Callable, Mapping, Sequence
5
5
  from pathlib import Path
6
6
  from time import time
7
7
  from typing import Any, Literal
@@ -25,6 +25,7 @@ from mteb.models import (
25
25
  SearchProtocol,
26
26
  )
27
27
  from mteb.types import (
28
+ EncodeKwargs,
28
29
  HFSubset,
29
30
  QueryDatasetType,
30
31
  RelevantDocumentsType,
@@ -184,17 +185,17 @@ class AbsTaskRetrieval(AbsTask):
184
185
  return queries, corpus
185
186
 
186
187
  if self.metadata.is_multilingual:
187
- for subset in self.queries:
188
- for split in self.queries[subset]:
189
- queries = self.queries[subset][split]
190
- corpus = self.corpus[subset][split]
188
+ for subset in self.queries: # type: ignore[attr-defined]
189
+ for split in self.queries[subset]: # type: ignore[attr-defined]
190
+ queries = self.queries[subset][split] # type: ignore[attr-defined]
191
+ corpus = self.corpus[subset][split] # type: ignore[attr-defined]
191
192
 
192
193
  (
193
194
  self.dataset[subset][split]["queries"],
194
195
  self.dataset[subset][split]["corpus"],
195
196
  ) = _process_split(queries, corpus)
196
197
 
197
- self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
198
+ self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
198
199
  subset
199
200
  ][split]
200
201
  if hasattr(self, "instructions"):
@@ -211,15 +212,15 @@ class AbsTaskRetrieval(AbsTask):
211
212
  ][split]
212
213
  else:
213
214
  subset = "default"
214
- for split in self.queries:
215
- queries = self.queries[split]
216
- corpus = self.corpus[split]
215
+ for split in self.queries: # type: ignore[attr-defined]
216
+ queries = self.queries[split] # type: ignore[attr-defined]
217
+ corpus = self.corpus[split] # type: ignore[attr-defined]
217
218
  (
218
219
  self.dataset[subset][split]["queries"],
219
220
  self.dataset[subset][split]["corpus"],
220
221
  ) = _process_split(queries, corpus)
221
222
 
222
- self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[
223
+ self.dataset[subset][split]["relevant_docs"] = self.relevant_docs[ # type: ignore[attr-defined]
223
224
  split
224
225
  ].copy()
225
226
  if hasattr(self, "instructions"):
@@ -235,9 +236,9 @@ class AbsTaskRetrieval(AbsTask):
235
236
  split
236
237
  ].copy()
237
238
 
238
- del self.queries
239
- del self.corpus
240
- del self.relevant_docs
239
+ del self.queries # type: ignore[attr-defined]
240
+ del self.corpus # type: ignore[attr-defined]
241
+ del self.relevant_docs # type: ignore[attr-defined]
241
242
  if hasattr(self, "instructions"):
242
243
  del self.instructions
243
244
  if hasattr(self, "top_ranked"):
@@ -283,10 +284,10 @@ class AbsTaskRetrieval(AbsTask):
283
284
  split: str = "test",
284
285
  subsets_to_run: list[HFSubset] | None = None,
285
286
  *,
286
- encode_kwargs: dict[str, Any],
287
+ encode_kwargs: EncodeKwargs,
287
288
  prediction_folder: Path | None = None,
288
- **kwargs,
289
- ) -> dict[HFSubset, ScoresDict]:
289
+ **kwargs: Any,
290
+ ) -> Mapping[HFSubset, ScoresDict]:
290
291
  """Evaluate the model on the retrieval task.
291
292
 
292
293
  Args:
@@ -320,7 +321,7 @@ class AbsTaskRetrieval(AbsTask):
320
321
  self,
321
322
  model: MTEBModels,
322
323
  data_split: RetrievalSplitData,
323
- encode_kwargs: dict[str, Any],
324
+ encode_kwargs: EncodeKwargs,
324
325
  hf_split: str,
325
326
  hf_subset: str,
326
327
  prediction_folder: Path | None = None,
@@ -357,6 +358,8 @@ class AbsTaskRetrieval(AbsTask):
357
358
  **kwargs,
358
359
  )
359
360
 
361
+ search_model: SearchProtocol
362
+
360
363
  if isinstance(model, EncoderProtocol) and not isinstance(model, SearchProtocol):
361
364
  search_model = SearchEncoderWrapper(model)
362
365
  elif isinstance(model, CrossEncoderProtocol):
@@ -578,11 +581,12 @@ class AbsTaskRetrieval(AbsTask):
578
581
  if isinstance(data[split][subset_item], Dataset):
579
582
  sections[split] = data[split][subset_item]
580
583
  elif converter is not None:
584
+ subset_data = data[split][subset_item]
585
+ if subset_data is None:
586
+ continue
587
+
581
588
  sections[split] = Dataset.from_list(
582
- [
583
- converter(idx, item)
584
- for idx, item in data[split][subset_item].items()
585
- ]
589
+ [converter(idx, item) for idx, item in subset_data.items()]
586
590
  )
587
591
  else:
588
592
  raise ValueError(
@@ -680,7 +684,7 @@ class AbsTaskRetrieval(AbsTask):
680
684
 
681
685
  top_k_sorted = defaultdict(list)
682
686
  for query_id, values in top_ranked.items():
683
- sorted_keys = sorted(values, key=values.get, reverse=True)
687
+ sorted_keys = sorted(values, key=lambda k: values[k], reverse=True)
684
688
  top_k_sorted[query_id] = sorted_keys[: self._top_k]
685
689
 
686
690
  self.dataset[subset][split]["top_ranked"] = top_k_sorted
@@ -688,10 +692,10 @@ class AbsTaskRetrieval(AbsTask):
688
692
 
689
693
 
690
694
  def _process_relevant_docs(
691
- collection: dict[str, dict[str, float]],
695
+ collection: Mapping[str, Mapping[str, int]],
692
696
  hf_subset: str,
693
697
  split: str,
694
- ) -> dict[str, dict[str, float]]:
698
+ ) -> dict[str, dict[str, int]]:
695
699
  """Collections can contain overlapping ids in different splits. Prepend split and subset to avoid this
696
700
 
697
701
  Returns:
@@ -136,7 +136,7 @@ class RetrievalDatasetLoader:
136
136
  "_id", "id"
137
137
  )
138
138
  logger.info("Loaded %d %s Documents.", len(corpus_ds), self.split.upper())
139
- logger.info("Doc Example: %s", corpus_ds[0])
139
+ logger.debug("Doc Example: %s", corpus_ds[0])
140
140
  return corpus_ds
141
141
 
142
142
  def _load_queries(self) -> QueryDatasetType:
@@ -152,7 +152,7 @@ class RetrievalDatasetLoader:
152
152
  )
153
153
 
154
154
  logger.info("Loaded %d %s queries.", len(queries_ds), self.split.upper())
155
- logger.info("Query Example: %s", queries_ds[0])
155
+ logger.debug("Query Example: %s", queries_ds[0])
156
156
 
157
157
  return queries_ds
158
158
 
mteb/abstasks/sts.py CHANGED
@@ -7,7 +7,8 @@ from scipy.stats import pearsonr, spearmanr
7
7
 
8
8
  from mteb._evaluators import AnySTSEvaluator
9
9
  from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
10
- from mteb.models import EncoderProtocol
10
+ from mteb.models import EncoderProtocol, MTEBModels
11
+ from mteb.types import EncodeKwargs, PromptType
11
12
  from mteb.types.statistics import (
12
13
  ImageStatistics,
13
14
  ScoreStatistics,
@@ -89,23 +90,30 @@ class AbsTaskSTS(AbsTask):
89
90
  min_score: Minimum possible score in the dataset.
90
91
  max_score: Maximum possible score in the dataset.
91
92
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
93
+ input1_prompt_type: Type of prompt of first input. Used for asymmetric tasks.
94
+ input2_prompt_type: Type of prompt of second input. Used for asymmetric tasks.
92
95
  """
93
96
 
94
97
  abstask_prompt = "Retrieve semantically similar text."
95
98
  column_names: tuple[str, str] = ("sentence1", "sentence2")
96
99
  min_score: int = 0
97
100
  max_score: int = 5
101
+ input1_prompt_type: PromptType | None = None
102
+ input2_prompt_type: PromptType | None = None
98
103
 
99
104
  def _evaluate_subset(
100
105
  self,
101
- model: EncoderProtocol,
106
+ model: MTEBModels,
102
107
  data_split: Dataset,
103
- encode_kwargs: dict[str, Any],
108
+ encode_kwargs: EncodeKwargs,
104
109
  hf_split: str,
105
110
  hf_subset: str,
106
111
  prediction_folder: Path | None = None,
107
112
  **kwargs: Any,
108
113
  ) -> STSMetrics:
114
+ if not isinstance(model, EncoderProtocol):
115
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
116
+
109
117
  normalized_scores = list(map(self._normalize, data_split["score"]))
110
118
  data_split = data_split.select_columns(list(self.column_names))
111
119
 
@@ -115,6 +123,8 @@ class AbsTaskSTS(AbsTask):
115
123
  task_metadata=self.metadata,
116
124
  hf_split=hf_split,
117
125
  hf_subset=hf_subset,
126
+ input1_prompt_type=self.input1_prompt_type,
127
+ input2_prompt_type=self.input2_prompt_type,
118
128
  **kwargs,
119
129
  )
120
130
  scores = evaluator(model, encode_kwargs=encode_kwargs)
@@ -135,7 +145,7 @@ class AbsTaskSTS(AbsTask):
135
145
  ) -> STSMetrics:
136
146
  def compute_corr(x: list[float], y: list[float]) -> tuple[float, float]:
137
147
  """Return (pearson, spearman) correlations between x and y."""
138
- return pearsonr(x, y)[0], spearmanr(x, y)[0]
148
+ return float(pearsonr(x, y)[0]), float(spearmanr(x, y)[0])
139
149
 
140
150
  cosine_pearson, cosine_spearman = compute_corr(
141
151
  normalized_scores, scores["cosine_scores"]
@@ -2,9 +2,10 @@ import json
2
2
  import logging
3
3
  from collections.abc import Sequence
4
4
  from pathlib import Path
5
- from typing import Any, Literal
5
+ from typing import Any, Literal, cast
6
6
 
7
7
  from huggingface_hub import (
8
+ CardData,
8
9
  DatasetCard,
9
10
  DatasetCardData,
10
11
  constants,
@@ -107,6 +108,7 @@ The domains follow the categories used in the [Universal Dependencies project](h
107
108
  SampleCreationMethod = Literal[
108
109
  "found",
109
110
  "created",
111
+ "created and machine-translated",
110
112
  "human-translated and localized",
111
113
  "human-translated",
112
114
  "machine-translated",
@@ -149,7 +151,7 @@ _TASK_TYPE = (
149
151
  "InstructionReranking",
150
152
  ) + MIEB_TASK_TYPE
151
153
 
152
- TaskType = Literal[_TASK_TYPE]
154
+ TaskType = Literal[_TASK_TYPE] # type: ignore[valid-type]
153
155
  """The type of the task. E.g. includes "Classification", "Retrieval" and "Clustering"."""
154
156
 
155
157
 
@@ -191,8 +193,10 @@ AnnotatorType = Literal[
191
193
  """The type of the annotators. Is often important for understanding the quality of a dataset."""
192
194
 
193
195
 
194
- PromptDict = TypedDict(
195
- "PromptDict", {prompt_type.value: str for prompt_type in PromptType}, total=False
196
+ PromptDict = TypedDict( # type: ignore[misc]
197
+ "PromptDict",
198
+ {prompt_type.value: str for prompt_type in PromptType},
199
+ total=False,
196
200
  )
197
201
  """A dictionary containing the prompt used for the task.
198
202
 
@@ -364,7 +368,7 @@ class TaskMetadata(BaseModel):
364
368
  """Return a dictionary mapping huggingface subsets to languages."""
365
369
  if isinstance(self.eval_langs, dict):
366
370
  return self.eval_langs
367
- return {"default": self.eval_langs} # type: ignore
371
+ return {"default": cast(list[str], self.eval_langs)}
368
372
 
369
373
  @property
370
374
  def intext_citation(self, include_cite: bool = True) -> str:
@@ -375,9 +379,8 @@ class TaskMetadata(BaseModel):
375
379
  if include_cite and cite:
376
380
  # check for whitespace in the citation
377
381
  if " " in cite:
378
- logger.warning(
379
- "Citation contains whitespace. Please ensure that the citation is correctly formatted."
380
- )
382
+ msg = "Citation contains whitespace. Please ensure that the citation is correctly formatted."
383
+ logger.warning(msg)
381
384
  return f"\\cite{{{cite}}}"
382
385
  return cite
383
386
 
@@ -413,7 +416,7 @@ class TaskMetadata(BaseModel):
413
416
  for subset, subset_value in stats.items():
414
417
  if subset == "hf_subset_descriptive_stats":
415
418
  continue
416
- n_samples[subset] = subset_value["num_samples"] # type: ignore
419
+ n_samples[subset] = subset_value["num_samples"]
417
420
  return n_samples
418
421
 
419
422
  @property
@@ -446,7 +449,7 @@ class TaskMetadata(BaseModel):
446
449
  Raises:
447
450
  ValueError: If the prompt type is not recognized.
448
451
  """
449
- if prompt_type is None:
452
+ if prompt_type is None or self.category is None:
450
453
  return self.modalities
451
454
  query_modalities, doc_modalities = self.category.split("2")
452
455
  category_to_modality: dict[str, Modalities] = {
@@ -466,7 +469,7 @@ class TaskMetadata(BaseModel):
466
469
 
467
470
  def _create_dataset_card_data(
468
471
  self,
469
- existing_dataset_card_data: DatasetCardData | None = None,
472
+ existing_dataset_card_data: CardData | None = None,
470
473
  ) -> tuple[DatasetCardData, dict[str, Any]]:
471
474
  """Create a DatasetCardData object from the task metadata.
472
475
 
@@ -482,7 +485,6 @@ class TaskMetadata(BaseModel):
482
485
  dataset_type = [
483
486
  *self._hf_task_type(),
484
487
  *self._hf_task_category(),
485
- *self._hf_subtypes(),
486
488
  ]
487
489
  languages = self._hf_languages()
488
490
 
@@ -501,12 +503,13 @@ class TaskMetadata(BaseModel):
501
503
 
502
504
  tags = ["mteb"] + self.modalities
503
505
 
504
- descriptive_stats = self.descriptive_stats
505
- if descriptive_stats is not None:
506
- for split, split_stat in descriptive_stats.items():
506
+ descriptive_stats = ""
507
+ if self.descriptive_stats is not None:
508
+ descriptive_stats_ = self.descriptive_stats
509
+ for split, split_stat in descriptive_stats_.items():
507
510
  if len(split_stat.get("hf_subset_descriptive_stats", {})) > 10:
508
511
  split_stat.pop("hf_subset_descriptive_stats", {})
509
- descriptive_stats = json.dumps(descriptive_stats, indent=4)
512
+ descriptive_stats = json.dumps(descriptive_stats_, indent=4)
510
513
 
511
514
  dataset_card_data_params = existing_dataset_card_data.to_dict()
512
515
  # override the existing values
@@ -583,10 +586,8 @@ class TaskMetadata(BaseModel):
583
586
 
584
587
  def _hf_subtypes(self) -> list[str]:
585
588
  # to get full list of available task_ids execute
586
- # requests.post("https://huggingface.co/api/validate-yaml", json={
587
- # "content": "---\ntask_ids: 'test'\n---",
588
- # "repoType": "dataset"
589
- # })
589
+ # https://huggingface.co/api/datasets-tags-by-type?type=task_ids
590
+ # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
590
591
  mteb_to_hf_subtype = {
591
592
  "Article retrieval": ["document-retrieval"],
592
593
  "Conversational retrieval": ["conversational", "utterance-retrieval"],
@@ -608,7 +609,7 @@ class TaskMetadata(BaseModel):
608
609
  "hate-speech-detection",
609
610
  ],
610
611
  "Thematic clustering": [],
611
- "Scientific Reranking": [],
612
+ "Scientific Reranking": ["text-scoring"],
612
613
  "Claim verification": ["fact-checking", "fact-checking-retrieval"],
613
614
  "Topic classification": ["topic-classification"],
614
615
  "Code retrieval": [],
@@ -616,21 +617,21 @@ class TaskMetadata(BaseModel):
616
617
  "Cross-Lingual Semantic Discrimination": [],
617
618
  "Textual Entailment": ["natural-language-inference"],
618
619
  "Counterfactual Detection": [],
619
- "Emotion classification": [],
620
+ "Emotion classification": ["sentiment-classification"],
620
621
  "Reasoning as Retrieval": [],
621
622
  "Rendered Texts Understanding": [],
622
623
  "Image Text Retrieval": [],
623
624
  "Object recognition": [],
624
625
  "Scene recognition": [],
625
626
  "Caption Pairing": ["image-captioning"],
626
- "Emotion recognition": [],
627
+ "Emotion recognition": ["sentiment-scoring"],
627
628
  "Textures recognition": [],
628
629
  "Activity recognition": [],
629
630
  "Tumor detection": [],
630
631
  "Duplicate Detection": [],
631
632
  "Rendered semantic textual similarity": [
632
633
  "semantic-similarity-scoring",
633
- "rendered semantic textual similarity",
634
+ "semantic-similarity-classification",
634
635
  ],
635
636
  "Intent classification": [
636
637
  "intent-classification",
@@ -644,10 +645,8 @@ class TaskMetadata(BaseModel):
644
645
 
645
646
  def _hf_task_type(self) -> list[str]:
646
647
  # to get full list of task_types execute:
647
- # requests.post("https://huggingface.co/api/validate-yaml", json={
648
- # "content": "---\ntask_categories: ['test']\n---", "repoType": "dataset"
649
- # }).json()
650
- # or look at https://huggingface.co/tasks
648
+ # https://huggingface.co/api/datasets-tags-by-type?type=task_categories
649
+ # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
651
650
  mteb_task_type_to_datasets = {
652
651
  # Text
653
652
  "BitextMining": ["translation"],
@@ -666,7 +665,7 @@ class TaskMetadata(BaseModel):
666
665
  "Any2AnyRetrieval": ["visual-document-retrieval"],
667
666
  "Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
668
667
  "VisionCentricQA": ["visual-question-answering"],
669
- "ImageClustering": ["image-clustering"],
668
+ "ImageClustering": ["image-feature-extraction"],
670
669
  "ImageClassification": ["image-classification"],
671
670
  "ImageMultilabelClassification": ["image-classification"],
672
671
  "DocumentUnderstanding": ["visual-document-retrieval"],
@@ -694,11 +693,11 @@ class TaskMetadata(BaseModel):
694
693
 
695
694
  def _hf_languages(self) -> list[str]:
696
695
  languages: list[str] = []
697
- if self.is_multilingual:
698
- for val in list(self.eval_langs.values()):
696
+ if self.is_multilingual and isinstance(self.eval_langs, dict):
697
+ for val in self.eval_langs.values():
699
698
  languages.extend(val)
700
699
  else:
701
- languages = self.eval_langs
700
+ languages = cast(list[str], self.eval_langs)
702
701
  # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
703
702
  # or a special value like "code", "multilingual".
704
703
  readme_langs = []
@@ -710,7 +709,7 @@ class TaskMetadata(BaseModel):
710
709
  readme_langs.append(lang_name)
711
710
  return sorted(set(readme_langs))
712
711
 
713
- def _hf_license(self) -> str:
712
+ def _hf_license(self) -> str | None:
714
713
  dataset_license = self.license
715
714
  if dataset_license:
716
715
  license_mapping = {
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from collections import defaultdict
3
3
  from pathlib import Path
4
- from typing import Any, ClassVar, TypedDict
4
+ from typing import Any, ClassVar, TypedDict, cast
5
5
 
6
6
  from datasets import Dataset, DatasetDict
7
7
  from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -10,7 +10,7 @@ from mteb._evaluators import BitextMiningEvaluator
10
10
  from mteb.abstasks._statistics_calculation import calculate_text_statistics
11
11
  from mteb.abstasks.abstask import AbsTask
12
12
  from mteb.models import EncoderProtocol, MTEBModels
13
- from mteb.types import HFSubset, ScoresDict
13
+ from mteb.types import EncodeKwargs, HFSubset, ScoresDict
14
14
  from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics
15
15
 
16
16
  logger = logging.getLogger(__name__)
@@ -73,11 +73,14 @@ class AbsTaskBitextMining(AbsTask):
73
73
  split: str = "test",
74
74
  subsets_to_run: list[HFSubset] | None = None,
75
75
  *,
76
- encode_kwargs: dict[str, Any],
76
+ encode_kwargs: EncodeKwargs,
77
77
  prediction_folder: Path | None = None,
78
78
  **kwargs: Any,
79
79
  ) -> dict[HFSubset, ScoresDict]:
80
80
  """Added load for "parallel" datasets"""
81
+ if not isinstance(model, EncoderProtocol):
82
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
83
+
81
84
  if not self.data_loaded:
82
85
  self.load_data()
83
86
 
@@ -87,11 +90,16 @@ class AbsTaskBitextMining(AbsTask):
87
90
  if subsets_to_run is not None:
88
91
  hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
89
92
 
90
- scores = {}
93
+ encoder_model = cast(EncoderProtocol, model)
94
+
95
+ if self.dataset is None:
96
+ raise ValueError("Dataset is not loaded.")
97
+
98
+ scores: dict[str, BitextMiningMetrics] = {}
91
99
  if self.parallel_subsets:
92
- scores = self._evaluate_subset(
93
- model,
94
- self.dataset[split], # type: ignore
100
+ scores = self._evaluate_subset( # type: ignore[assignment]
101
+ encoder_model,
102
+ self.dataset[split],
95
103
  parallel=True,
96
104
  hf_split=split,
97
105
  hf_subset="parallel",
@@ -109,8 +117,8 @@ class AbsTaskBitextMining(AbsTask):
109
117
  data_split = self.dataset[split]
110
118
  else:
111
119
  data_split = self.dataset[hf_subset][split]
112
- scores[hf_subset] = self._evaluate_subset(
113
- model,
120
+ scores[hf_subset] = self._evaluate_subset( # type: ignore[assignment]
121
+ encoder_model,
114
122
  data_split,
115
123
  hf_split=split,
116
124
  hf_subset=hf_subset,
@@ -119,32 +127,32 @@ class AbsTaskBitextMining(AbsTask):
119
127
  **kwargs,
120
128
  )
121
129
 
122
- return scores
130
+ return cast(dict[HFSubset, ScoresDict], scores)
123
131
 
124
132
  def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
125
133
  pairs = self._DEFAULT_PAIR
126
134
  if parallel:
127
- pairs = [langpair.split("-") for langpair in self.hf_subsets]
135
+ pairs = [langpair.split("-") for langpair in self.hf_subsets] # type: ignore[misc]
128
136
  return pairs
129
137
 
130
- def _evaluate_subset(
138
+ def _evaluate_subset( # type: ignore[override]
131
139
  self,
132
140
  model: EncoderProtocol,
133
141
  data_split: Dataset,
134
142
  *,
135
143
  hf_split: str,
136
144
  hf_subset: str,
137
- parallel: bool = False,
138
- encode_kwargs: dict[str, Any],
145
+ encode_kwargs: EncodeKwargs,
139
146
  prediction_folder: Path | None = None,
147
+ parallel: bool = False,
140
148
  **kwargs,
141
- ) -> ScoresDict:
149
+ ) -> BitextMiningMetrics | dict[str, BitextMiningMetrics]:
142
150
  pairs = self._get_pairs(parallel)
143
151
 
144
152
  evaluator = BitextMiningEvaluator(
145
153
  data_split,
146
154
  task_metadata=self.metadata,
147
- pair_columns=pairs, # type: ignore
155
+ pair_columns=pairs,
148
156
  hf_split=hf_split,
149
157
  hf_subset=hf_subset,
150
158
  **kwargs,
@@ -168,16 +176,16 @@ class AbsTaskBitextMining(AbsTask):
168
176
  )
169
177
 
170
178
  if parallel:
171
- metrics = {}
179
+ parallel_metrics = {}
172
180
  for keys, nearest_neighbors in neighbours.items():
173
- metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
181
+ parallel_metrics[keys] = self._compute_metrics(nearest_neighbors, gold)
174
182
 
175
- for v in metrics.values():
183
+ for v in parallel_metrics.values():
176
184
  self._add_main_score(v)
177
- else:
178
- def_pair_str = "-".join(self._DEFAULT_PAIR[0])
179
- metrics = self._compute_metrics(neighbours[def_pair_str], gold)
180
- self._add_main_score(metrics)
185
+ return parallel_metrics
186
+ def_pair_str = "-".join(self._DEFAULT_PAIR[0])
187
+ metrics = self._compute_metrics(neighbours[def_pair_str], gold)
188
+ self._add_main_score(metrics)
181
189
  return metrics
182
190
 
183
191
  def _compute_metrics(
@@ -250,8 +258,11 @@ class AbsTaskBitextMining(AbsTask):
250
258
  )
251
259
 
252
260
  def _push_dataset_to_hub(self, repo_name: str) -> None:
261
+ if self.dataset is None:
262
+ raise ValueError("Dataset is not loaded.")
263
+
253
264
  if self.metadata.is_multilingual:
254
- dataset = defaultdict(dict)
265
+ dataset: dict[str, dict[str, list[str]]] = defaultdict(dict)
255
266
  for config in self.metadata.eval_langs:
256
267
  logger.info(f"Converting {config} of {self.metadata.name}")
257
268
 
@@ -266,10 +277,10 @@ class AbsTaskBitextMining(AbsTask):
266
277
  for split in self.dataset[config]:
267
278
  dataset[split][lang_1] = self.dataset[config][split][sent_1]
268
279
  dataset[split][lang_2] = self.dataset[config][split][sent_2]
269
- for split in dataset:
270
- dataset[split] = Dataset.from_dict(dataset[split])
271
- dataset = DatasetDict(dataset)
272
- dataset.push_to_hub(repo_name)
280
+ dataset_dict = DatasetDict(
281
+ {split: Dataset.from_dict(dataset[split]) for split in dataset}
282
+ )
283
+ dataset_dict.push_to_hub(repo_name)
273
284
  else:
274
285
  sentences = {}
275
286
  for split in self.dataset:
@@ -16,7 +16,7 @@ else:
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
- OLD_FORMAT_RERANKING_TASKS = []
19
+ OLD_FORMAT_RERANKING_TASKS: list[str] = []
20
20
 
21
21
 
22
22
  @deprecated(
@@ -100,12 +100,14 @@ class AbsTaskReranking(AbsTaskRetrieval):
100
100
  if self.metadata.name not in OLD_FORMAT_RERANKING_TASKS:
101
101
  return
102
102
 
103
- logging.info(
103
+ logger.info(
104
104
  f"Transforming old format to standard format for {self.metadata.name}"
105
105
  )
106
106
 
107
107
  given_dataset = copy(given_dataset)
108
- self.dataset = defaultdict(lambda: defaultdict(dict))
108
+ self.dataset: dict[str, dict[str, RetrievalSplitData]] = defaultdict(
109
+ lambda: defaultdict(dict) # type: ignore[arg-type]
110
+ )
109
111
 
110
112
  hf_subsets = self.hf_subsets
111
113
 
@@ -115,19 +117,19 @@ class AbsTaskReranking(AbsTaskRetrieval):
115
117
  if hf_subset in cur_dataset:
116
118
  cur_dataset = cur_dataset[hf_subset]
117
119
  elif "name" in self.metadata.dataset:
118
- cur_dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore
120
+ cur_dataset = datasets.load_dataset(**self.metadata.dataset)
119
121
  assert hf_subset == "default", (
120
122
  f"Only default subset is supported for {self.metadata.name} since `name` is given in the metadata."
121
123
  )
122
124
  else:
123
125
  cur_dataset = datasets.load_dataset(
124
126
  **self.metadata.dataset, name=hf_subset
125
- ) # type: ignore
127
+ )
126
128
 
127
129
  for split in cur_dataset:
128
130
  corpus = []
129
131
  queries = []
130
- relevant_docs = defaultdict(dict)
132
+ relevant_docs: dict[str, dict[str, int]] = defaultdict(dict)
131
133
  top_ranked = defaultdict(list)
132
134
 
133
135
  # Create an enumerated dataset to pass indices
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  from pathlib import Path
3
- from typing import Any
4
3
 
5
4
  import numpy as np
6
5
  from datasets import Dataset
@@ -12,7 +11,8 @@ from mteb.abstasks._statistics_calculation import (
12
11
  calculate_text_statistics,
13
12
  )
14
13
  from mteb.abstasks.abstask import AbsTask
15
- from mteb.models import EncoderProtocol
14
+ from mteb.models import EncoderProtocol, MTEBModels
15
+ from mteb.types import EncodeKwargs
16
16
  from mteb.types.statistics import (
17
17
  ScoreStatistics,
18
18
  SplitDescriptiveStatistics,
@@ -77,17 +77,22 @@ class AbsTaskSummarization(AbsTask):
77
77
 
78
78
  def _evaluate_subset(
79
79
  self,
80
- model: EncoderProtocol,
80
+ model: MTEBModels,
81
81
  data_split: Dataset,
82
82
  *,
83
83
  hf_split: str,
84
84
  hf_subset: str,
85
- encode_kwargs: dict[str, Any],
85
+ encode_kwargs: EncodeKwargs,
86
86
  prediction_folder: Path | None = None,
87
87
  **kwargs,
88
88
  ) -> SummarizationMetrics:
89
+ if not isinstance(model, EncoderProtocol):
90
+ raise TypeError("Expected model to be an instance of EncoderProtocol")
91
+
89
92
  normalized_scores = [
90
- (np.array(x) - self.min_score) / (self.max_score - self.min_score)
93
+ (
94
+ (np.array(x) - self.min_score) / (self.max_score - self.min_score)
95
+ ).tolist()
91
96
  for x in data_split[self.relevancy_column_name]
92
97
  ]
93
98
  evaluator = self.evaluator(