mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
1
1
  import logging
2
+ import warnings
3
+ from collections.abc import Sequence
2
4
  from pathlib import Path
3
5
 
4
6
  from huggingface_hub import ModelCard, ModelCardData, repo_exists
5
7
 
6
- from mteb import BenchmarkResults
7
8
  from mteb.abstasks.abstask import AbsTask
9
+ from mteb.benchmarks.benchmark import Benchmark
8
10
  from mteb.cache import ResultCache
9
11
 
10
12
  logger = logging.getLogger(__name__)
@@ -12,12 +14,13 @@ logger = logging.getLogger(__name__)
12
14
 
13
15
  def generate_model_card(
14
16
  model_name: str,
15
- tasks: list[AbsTask] | None = None,
17
+ tasks: Sequence[AbsTask] | None = None,
18
+ benchmarks: Sequence[Benchmark] | None = None,
16
19
  existing_model_card_id_or_path: str | Path | None = None,
17
20
  results_cache: ResultCache = ResultCache(),
18
21
  output_path: Path = Path("model_card.md"),
19
22
  add_table_to_model_card: bool = False,
20
- models_to_compare: list[str] | None = None,
23
+ models_to_compare: Sequence[str] | None = None,
21
24
  token: str | None = None,
22
25
  push_to_hub: bool = False,
23
26
  ) -> None:
@@ -26,6 +29,7 @@ def generate_model_card(
26
29
  Args:
27
30
  model_name: Name of the model.
28
31
  tasks: List of tasks to generate results for.
32
+ benchmarks: A Benchmark or list of benchmarks to generate results for.
29
33
  existing_model_card_id_or_path: Path or ID of an existing model card to update.
30
34
  results_cache: Instance of ResultCache to load results from.
31
35
  output_path: Path to save the generated model card.
@@ -39,16 +43,24 @@ def generate_model_card(
39
43
  if existing_model_card_id_or_path:
40
44
  existing_model_card = ModelCard.load(existing_model_card_id_or_path)
41
45
 
46
+ all_tasks: list[AbsTask] = []
47
+ if tasks is not None:
48
+ all_tasks.extend(tasks)
49
+
50
+ if benchmarks is not None:
51
+ for b in benchmarks:
52
+ all_tasks.extend(b.tasks)
53
+
42
54
  benchmark_results = results_cache.load_results(
43
- [model_name], tasks, only_main_score=True
55
+ [model_name], all_tasks if all_tasks else None, only_main_score=True
44
56
  )
45
57
  eval_results = []
46
58
  for models_results in benchmark_results.model_results:
47
59
  for task_result in models_results.task_results:
48
60
  eval_results.extend(task_result.get_hf_eval_results())
49
61
 
50
- existing_model_card_data = (
51
- existing_model_card.data if existing_model_card else ModelCardData()
62
+ existing_model_card_data: ModelCardData = (
63
+ existing_model_card.data if existing_model_card else ModelCardData() # type: ignore[assignment]
52
64
  )
53
65
 
54
66
  if existing_model_card_data.eval_results is None:
@@ -78,35 +90,43 @@ def generate_model_card(
78
90
  card_data=existing_model_card_data
79
91
  )
80
92
 
81
- if models_to_compare:
82
- benchmark_results = results_cache.load_results(
83
- [model_name, *models_to_compare], tasks, only_main_score=True
84
- )
85
-
86
93
  if add_table_to_model_card:
87
94
  existing_model_card = _add_table_to_model_card(
88
- benchmark_results, existing_model_card
95
+ results_cache,
96
+ existing_model_card,
97
+ (model_name, *models_to_compare) if models_to_compare else (model_name,),
98
+ benchmarks or [],
89
99
  )
90
100
 
91
- if push_to_hub:
101
+ if push_to_hub and existing_model_card_id_or_path:
102
+ existing_model_card_id_or_path = str(existing_model_card_id_or_path)
92
103
  if repo_exists(existing_model_card_id_or_path):
93
104
  existing_model_card.push_to_hub(existing_model_card_id_or_path, token=token)
94
105
  else:
95
- logger.warning(
96
- f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
97
- )
106
+ msg = f"Repository {existing_model_card_id_or_path} does not exist on the Hub. Skipping push to hub."
107
+ logger.warning(msg)
108
+ warnings.warn(msg)
98
109
  existing_model_card.save(output_path)
99
110
 
100
111
 
101
112
  def _add_table_to_model_card(
102
- results: BenchmarkResults, model_card: ModelCard
113
+ results_cache: ResultCache,
114
+ model_card: ModelCard,
115
+ models: Sequence[str],
116
+ benchmarks: Sequence[Benchmark],
103
117
  ) -> ModelCard:
104
118
  original_content = model_card.content
105
- results_df = results.to_dataframe()
106
- results_df = results_df.set_index("task_name")
107
- mteb_content = f"""
108
- # MTEB results
109
- {results_df.to_markdown()}
110
- """
119
+ mteb_content = "# MTEB Results\n\n"
120
+
121
+ for benchmark in benchmarks:
122
+ mteb_content += f"## Benchmark: {benchmark.name}\n\n"
123
+ benchmark_results = results_cache.load_results(
124
+ tasks=benchmark,
125
+ models=models,
126
+ only_main_score=True,
127
+ )
128
+ df_results = benchmark_results.get_benchmark_result()
129
+ mteb_content += df_results.to_markdown(index=True) + "\n\n"
130
+
111
131
  model_card.content = original_content + "\n\n" + mteb_content
112
132
  return model_card
@@ -5,39 +5,35 @@ import logging
5
5
  import os
6
6
  import sys
7
7
  import traceback
8
- from collections.abc import Iterable
8
+ import warnings
9
+ from collections.abc import Iterable, Sequence
9
10
  from copy import deepcopy
10
11
  from datetime import datetime
11
12
  from itertools import chain
12
13
  from pathlib import Path
13
14
  from time import time
14
- from typing import TYPE_CHECKING, Any
15
-
16
- from mteb.abstasks.task_metadata import TaskCategory, TaskType
17
- from mteb.models.get_model_meta import (
18
- _model_meta_from_cross_encoder,
19
- _model_meta_from_sentence_transformers,
20
- )
21
-
22
- if sys.version_info >= (3, 13):
23
- from warnings import deprecated
24
- else:
25
- from typing_extensions import deprecated
15
+ from typing import TYPE_CHECKING, Any, cast
26
16
 
27
17
  import datasets
28
18
 
29
19
  import mteb
30
20
  from mteb.abstasks import AbsTask
21
+ from mteb.abstasks.aggregated_task import AbsTaskAggregate
22
+ from mteb.abstasks.task_metadata import TaskCategory, TaskType
31
23
  from mteb.benchmarks import Benchmark
32
24
  from mteb.models import (
33
25
  CrossEncoderWrapper,
34
- EncoderProtocol,
35
26
  ModelMeta,
36
27
  MTEBModels,
37
28
  SentenceTransformerEncoderWrapper,
38
29
  )
39
30
  from mteb.results import TaskResult
40
- from mteb.types import ScoresDict
31
+ from mteb.types import EncodeKwargs, ScoresDict
32
+
33
+ if sys.version_info >= (3, 13):
34
+ from warnings import deprecated
35
+ else:
36
+ from typing_extensions import deprecated
41
37
 
42
38
  if TYPE_CHECKING:
43
39
  from sentence_transformers import CrossEncoder, SentenceTransformer
@@ -57,7 +53,7 @@ class MTEB:
57
53
  )
58
54
  def __init__(
59
55
  self,
60
- tasks: Iterable[AbsTask | Benchmark],
56
+ tasks: Iterable[AbsTask] | Iterable[Benchmark],
61
57
  *,
62
58
  err_logs_path: str = "error_logs.txt",
63
59
  ) -> None:
@@ -68,15 +64,14 @@ class MTEB:
68
64
  `mteb.get_tasks(["task1","task2"]) or `mteb.get_benchmark("MTEB(eng, classic)").
69
65
  err_logs_path: Path to save error logs.
70
66
  """
71
- from mteb.benchmarks import Benchmark
72
-
73
- self.tasks = list(tasks)
74
- if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
67
+ if isinstance(next(iter(tasks)), Benchmark):
75
68
  self.benchmarks = tasks
76
- self.tasks = list(chain.from_iterable(self.tasks))
69
+ self.tasks = list(chain.from_iterable(cast(Iterable[Benchmark], tasks)))
70
+ elif isinstance(next(iter(tasks)), AbsTask):
71
+ self.tasks = list(cast(Iterable[AbsTask], tasks))
77
72
 
78
73
  self.err_logs_path = Path(err_logs_path)
79
- self.last_evaluated_splits = {}
74
+ self._last_evaluated_splits: dict[str, list[str]] = {}
80
75
 
81
76
  @property
82
77
  def available_tasks(self) -> list[str]:
@@ -89,7 +84,7 @@ class MTEB:
89
84
  return sorted({x.metadata.type for x in self.tasks})
90
85
 
91
86
  @property
92
- def available_task_categories(self) -> set[TaskCategory]:
87
+ def available_task_categories(self) -> set[TaskCategory | None]:
93
88
  """Set of available task categories."""
94
89
  return {x.metadata.category for x in self.tasks}
95
90
 
@@ -179,7 +174,7 @@ class MTEB:
179
174
  split: str,
180
175
  subsets_to_run: list[str] | None = None,
181
176
  *,
182
- encode_kwargs: dict[str, Any],
177
+ encode_kwargs: EncodeKwargs,
183
178
  **kwargs: Any,
184
179
  ):
185
180
  tick = time()
@@ -236,13 +231,14 @@ class MTEB:
236
231
  merged_kg_co2_emissions = None
237
232
  if existing_kg_co2_emissions and new_kg_co2_emissions:
238
233
  merged_kg_co2_emissions = existing_kg_co2_emissions + new_kg_co2_emissions
234
+ existing_evaluation_time = existing_results.evaluation_time or 0
235
+ new_evaluation_time = new_results.evaluation_time or 0
239
236
  merged_results = TaskResult(
240
237
  dataset_revision=new_results.dataset_revision,
241
238
  task_name=new_results.task_name,
242
239
  mteb_version=new_results.mteb_version,
243
240
  scores=merged_scores,
244
- evaluation_time=existing_results.evaluation_time
245
- + new_results.evaluation_time,
241
+ evaluation_time=existing_evaluation_time + new_evaluation_time,
246
242
  kg_co2_emissions=merged_kg_co2_emissions,
247
243
  )
248
244
 
@@ -267,7 +263,7 @@ class MTEB:
267
263
  overwrite_results: bool = False,
268
264
  raise_error: bool = True,
269
265
  co2_tracker: bool = False,
270
- encode_kwargs: dict[str, Any] | None = None,
266
+ encode_kwargs: EncodeKwargs | None = None,
271
267
  **kwargs,
272
268
  ) -> list[TaskResult]:
273
269
  """Run the evaluation pipeline on the selected tasks.
@@ -311,13 +307,16 @@ class MTEB:
311
307
  elif verbosity == 3:
312
308
  datasets.logging.set_verbosity(logging.DEBUG)
313
309
 
314
- meta = self.create_model_meta(model)
315
- output_path = self._create_output_folder(meta, output_folder)
316
-
310
+ mteb_model: MTEBModels
317
311
  if isinstance(model, SentenceTransformer):
318
- model = SentenceTransformerEncoderWrapper(model)
312
+ mteb_model = SentenceTransformerEncoderWrapper(model)
319
313
  elif isinstance(model, CrossEncoder):
320
- model = CrossEncoderWrapper(model)
314
+ mteb_model = CrossEncoderWrapper(model)
315
+ else:
316
+ mteb_model = cast(MTEBModels, model)
317
+
318
+ meta = self.create_model_meta(mteb_model)
319
+ output_path = self._create_output_folder(meta, output_folder)
321
320
 
322
321
  # Disable co2_tracker for API models
323
322
  if "API" in meta.framework:
@@ -338,7 +337,7 @@ class MTEB:
338
337
  ) # save them in case we re-use the object (e.g. for reranking)
339
338
 
340
339
  # To evaluate missing splits, we keep track of the task name and the corresponding splits.
341
- self.last_evaluated_splits = {}
340
+ self._last_evaluated_splits = {}
342
341
 
343
342
  while len(self.tasks) > 0:
344
343
  task = self.tasks[0]
@@ -347,9 +346,10 @@ class MTEB:
347
346
  )
348
347
 
349
348
  if task.is_aggregate:
350
- self_ = MTEB(tasks=task.metadata.tasks)
351
- task_results = self_.run(
352
- model,
349
+ aggregated_task = cast(AbsTaskAggregate, task)
350
+ self_ = MTEB(tasks=aggregated_task.metadata.tasks)
351
+ aggregated_task_results = self_.run(
352
+ mteb_model,
353
353
  verbosity=verbosity - 1,
354
354
  output_folder=output_folder,
355
355
  eval_splits=eval_splits,
@@ -360,12 +360,15 @@ class MTEB:
360
360
  encode_kwargs=encode_kwargs,
361
361
  **kwargs,
362
362
  )
363
- new_results = task.combine_task_results(task_results)
363
+ new_results = aggregated_task.combine_task_results(
364
+ aggregated_task_results
365
+ )
364
366
  evaluation_results.append(new_results)
365
367
 
366
368
  if output_path:
367
- save_path = output_path / f"{task.metadata.name}.json"
368
- new_results.to_disk(save_path)
369
+ new_results.to_disk(
370
+ output_path / f"{aggregated_task.metadata.name}.json"
371
+ )
369
372
  del self.tasks[0]
370
373
  continue
371
374
 
@@ -387,7 +390,7 @@ class MTEB:
387
390
  task_subsets = task.hf_subsets
388
391
 
389
392
  existing_results = None
390
- save_path = None
393
+ save_path: Path | None = None
391
394
  final_splits_to_run = task_eval_splits
392
395
  missing_evaluations = self._get_missing_evaluations(
393
396
  existing_results,
@@ -437,7 +440,7 @@ class MTEB:
437
440
  logger.info(
438
441
  f"No splits to evaluate for {task.metadata.name}. Skipping evaluation."
439
442
  )
440
- self.last_evaluated_splits[task.metadata.name] = []
443
+ self._last_evaluated_splits[task.metadata.name] = []
441
444
  del self.tasks[0]
442
445
  continue
443
446
 
@@ -445,11 +448,11 @@ class MTEB:
445
448
  task.check_if_dataset_is_superseded()
446
449
  task.load_data()
447
450
 
448
- task_results = {}
451
+ task_results: dict[str, dict[str, dict[str, Any]]] = {}
449
452
  evaluation_time = 0
450
453
  kg_co2_emissions: int | None = 0 if co2_tracker else None
451
454
 
452
- self.last_evaluated_splits[task.metadata.name] = []
455
+ self._last_evaluated_splits[task.metadata.name] = []
453
456
 
454
457
  for split in final_splits_to_run:
455
458
  info = missing_evaluations[split]
@@ -470,14 +473,16 @@ class MTEB:
470
473
 
471
474
  if co2_tracker:
472
475
  try:
473
- from codecarbon import EmissionsTracker
476
+ from codecarbon import ( # type: ignore[import-not-found,import-untyped]
477
+ EmissionsTracker,
478
+ )
474
479
  except ImportError:
475
480
  raise ImportError(
476
481
  "codecarbon is not installed. Please install it using `pip install 'mteb[codecarbon]'` to track CO₂ emissions."
477
482
  )
478
- logger.warning(
479
- "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
480
- )
483
+ msg = "Evaluating multiple MTEB runs simultaneously will produce incorrect CO₂ results"
484
+ logger.warning(msg)
485
+ warnings.warn(msg)
481
486
  with EmissionsTracker(
482
487
  save_to_file=False,
483
488
  save_to_api=False,
@@ -486,7 +491,7 @@ class MTEB:
486
491
  ) as tracker:
487
492
  results, tick, tock = self._run_eval(
488
493
  task,
489
- model,
494
+ mteb_model,
490
495
  split,
491
496
  encode_kwargs=encode_kwargs,
492
497
  subsets_to_run=subsets_to_run,
@@ -499,7 +504,7 @@ class MTEB:
499
504
  else:
500
505
  results, tick, tock = self._run_eval(
501
506
  task,
502
- model,
507
+ mteb_model,
503
508
  split,
504
509
  subsets_to_run=subsets_to_run,
505
510
  encode_kwargs=encode_kwargs,
@@ -515,25 +520,25 @@ class MTEB:
515
520
  if verbosity >= 1:
516
521
  logger.info(f"Scores: {task_results[split]}")
517
522
 
518
- self.last_evaluated_splits[task.metadata.name].append(split)
523
+ self._last_evaluated_splits[task.metadata.name].append(split)
519
524
 
520
525
  # Create new TaskResult
521
526
  new_results = TaskResult.from_task_results(
522
527
  task,
523
- task_results,
528
+ task_results, # type: ignore[arg-type]
524
529
  evaluation_time=evaluation_time,
525
530
  kg_co2_emissions=kg_co2_emissions,
526
531
  )
527
532
 
528
533
  # Merge with existing if needed
529
- if output_path and save_path.exists():
534
+ if output_path and save_path and save_path.exists():
530
535
  existing_results = TaskResult.from_disk(save_path)
531
536
  if existing_results:
532
537
  merged_results = self._merge_results(existing_results, new_results)
533
538
  else:
534
539
  merged_results = new_results
535
540
 
536
- if output_path:
541
+ if output_path and save_path:
537
542
  merged_results.to_disk(save_path)
538
543
 
539
544
  evaluation_results.append(merged_results)
@@ -560,7 +565,7 @@ class MTEB:
560
565
  def create_model_meta(model: MTEBModels) -> ModelMeta:
561
566
  """Create a ModelMeta object for the given model."""
562
567
  if hasattr(model, "mteb_model_meta") and model.mteb_model_meta is not None:
563
- meta = model.mteb_model_meta # type: ignore
568
+ meta = model.mteb_model_meta
564
569
  else:
565
570
  meta = MTEB._get_model_meta(model)
566
571
 
@@ -586,7 +591,11 @@ class MTEB:
586
591
  if output_folder is None:
587
592
  return None
588
593
 
589
- model_revision: str = model_meta.revision # type: ignore
594
+ model_revision: str = (
595
+ model_meta.revision
596
+ if model_meta.revision is not None
597
+ else "no_revision_available"
598
+ )
590
599
  model_path_name = model_meta.model_name_as_path()
591
600
 
592
601
  output_path = Path(output_folder) / model_path_name / model_revision
@@ -608,15 +617,15 @@ class MTEB:
608
617
  Tasks with empty lists indicate that results already existed and no splits were evaluated.
609
618
  """
610
619
  return deepcopy(
611
- {task: list(splits) for task, splits in self.last_evaluated_splits.items()}
620
+ {task: list(splits) for task, splits in self._last_evaluated_splits.items()}
612
621
  )
613
622
 
614
623
  @staticmethod
615
624
  def _get_missing_evaluations(
616
625
  existing_results: TaskResult | None,
617
- task_eval_splits: list[str],
618
- task_eval_langs: list[str],
619
- eval_subsets: list[str] | None,
626
+ task_eval_splits: Sequence[str],
627
+ task_eval_langs: Sequence[str],
628
+ eval_subsets: Sequence[str] | None,
620
629
  ) -> dict[str, dict[str, Any]]:
621
630
  """Return a dictionary for each split, indicating if the whole split is missing and which subsets are missing."""
622
631
  missing_evaluations = {
@@ -665,13 +674,13 @@ class MTEB:
665
674
  return missing_evaluations
666
675
 
667
676
  @staticmethod
668
- def _get_model_meta(model: EncoderProtocol) -> ModelMeta:
677
+ def _get_model_meta(model: MTEBModels) -> ModelMeta:
669
678
  from sentence_transformers import CrossEncoder, SentenceTransformer
670
679
 
671
680
  if isinstance(model, CrossEncoder):
672
- meta = _model_meta_from_cross_encoder(model)
681
+ meta = ModelMeta.from_cross_encoder(model)
673
682
  elif isinstance(model, SentenceTransformer):
674
- meta = _model_meta_from_sentence_transformers(model)
683
+ meta = ModelMeta.from_sentence_transformer_model(model)
675
684
  else:
676
685
  meta = ModelMeta(
677
686
  loader=None,
@@ -0,0 +1,61 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 19928,
4
+ "number_of_characters": 35466331,
5
+ "unique_pairs": 19928,
6
+ "sentence1_statistics": {
7
+ "total_text_length": 17733346,
8
+ "min_text_length": 103,
9
+ "average_text_length": 889.8708350060217,
10
+ "max_text_length": 11576,
11
+ "unique_texts": 19928
12
+ },
13
+ "sentence2_statistics": {
14
+ "total_text_length": 17732985,
15
+ "min_text_length": 103,
16
+ "average_text_length": 889.8527197912485,
17
+ "max_text_length": 11576,
18
+ "unique_texts": 19928
19
+ },
20
+ "hf_subset_descriptive_stats": {
21
+ "ru-en": {
22
+ "num_samples": 9965,
23
+ "number_of_characters": 17734926,
24
+ "unique_pairs": 9965,
25
+ "sentence1_statistics": {
26
+ "total_text_length": 8685585,
27
+ "min_text_length": 103,
28
+ "average_text_length": 871.6091319618665,
29
+ "max_text_length": 5675,
30
+ "unique_texts": 9965
31
+ },
32
+ "sentence2_statistics": {
33
+ "total_text_length": 9049341,
34
+ "min_text_length": 106,
35
+ "average_text_length": 908.1124937280482,
36
+ "max_text_length": 11576,
37
+ "unique_texts": 9965
38
+ }
39
+ },
40
+ "en-ru": {
41
+ "num_samples": 9963,
42
+ "number_of_characters": 17731405,
43
+ "unique_pairs": 9963,
44
+ "sentence1_statistics": {
45
+ "total_text_length": 9047761,
46
+ "min_text_length": 106,
47
+ "average_text_length": 908.1362039546322,
48
+ "max_text_length": 11576,
49
+ "unique_texts": 9963
50
+ },
51
+ "sentence2_statistics": {
52
+ "total_text_length": 8683644,
53
+ "min_text_length": 103,
54
+ "average_text_length": 871.5892803372478,
55
+ "max_text_length": 5675,
56
+ "unique_texts": 9963
57
+ }
58
+ }
59
+ }
60
+ }
61
+ }
@@ -0,0 +1,60 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1085,
4
+ "number_texts_intersect_with_train": 0,
5
+ "text_statistics": {
6
+ "total_text_length": 115359,
7
+ "min_text_length": 8,
8
+ "average_text_length": 106.32165898617511,
9
+ "max_text_length": 2722,
10
+ "unique_texts": 1085
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 3,
18
+ "labels": {
19
+ "0": {
20
+ "count": 868
21
+ },
22
+ "1": {
23
+ "count": 190
24
+ },
25
+ "2": {
26
+ "count": 27
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "train": {
32
+ "num_samples": 7176,
33
+ "number_texts_intersect_with_train": null,
34
+ "text_statistics": {
35
+ "total_text_length": 830248,
36
+ "min_text_length": 5,
37
+ "average_text_length": 115.69788182831661,
38
+ "max_text_length": 4759,
39
+ "unique_texts": 7176
40
+ },
41
+ "image_statistics": null,
42
+ "label_statistics": {
43
+ "min_labels_per_text": 1,
44
+ "average_label_per_text": 1.0,
45
+ "max_labels_per_text": 1,
46
+ "unique_labels": 3,
47
+ "labels": {
48
+ "0": {
49
+ "count": 4933
50
+ },
51
+ "1": {
52
+ "count": 2047
53
+ },
54
+ "2": {
55
+ "count": 196
56
+ }
57
+ }
58
+ }
59
+ }
60
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 193,
4
+ "number_texts_intersect_with_train": 0,
5
+ "text_statistics": {
6
+ "total_text_length": 1543015,
7
+ "min_text_length": 492,
8
+ "average_text_length": 7994.896373056995,
9
+ "max_text_length": 49510,
10
+ "unique_texts": 193
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "1": {
20
+ "count": 177
21
+ },
22
+ "0": {
23
+ "count": 16
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 870,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 6968132,
33
+ "min_text_length": 259,
34
+ "average_text_length": 8009.347126436782,
35
+ "max_text_length": 74490,
36
+ "unique_texts": 870
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 755
47
+ },
48
+ "0": {
49
+ "count": 115
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1299,
4
+ "number_of_characters": 9254,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2245,
8
+ "average_image_width": 2370.324347826087,
9
+ "max_image_width": 3508,
10
+ "min_image_height": 2481,
11
+ "average_image_height": 3289.8060869565215,
12
+ "max_image_height": 3580,
13
+ "unique_images": 1132
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 9254,
17
+ "min_text_length": 15,
18
+ "average_text_length": 62.10738255033557,
19
+ "max_text_length": 108,
20
+ "unique_texts": 149
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 409,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 2.7449664429530203,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 316
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }