mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,327 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import gc
5
+ import logging
6
+ import os
7
+ from collections.abc import Callable
8
+ from typing import TYPE_CHECKING, Any, Literal
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.utils.data import DataLoader
13
+
14
+ from mteb._requires_package import requires_package
15
+ from mteb.abstasks.task_metadata import TaskMetadata
16
+ from mteb.models import ModelMeta
17
+ from mteb.models.abs_encoder import AbsEncoder
18
+ from mteb.types import Array, BatchedInput, PromptType
19
+
20
+ if TYPE_CHECKING:
21
+ from vllm.config import PoolerConfig # type: ignore[import-not-found]
22
+ else:
23
+ PoolerConfig = dict[str, Any]
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ Dtype = Literal["half", "float16", "float", "float32", "bfloat16", "auto"]
28
+
29
+
30
class VllmWrapperBase:
    """Wrapper for vllm serving engine.

    Owns a single ``vllm.LLM`` instance (``self.llm``) created in pooling mode
    and releases it again through :meth:`cleanup`, which is also registered
    with ``atexit``.
    """

    # Value forwarded to ``EngineArgs(convert=...)``; subclasses override it.
    convert = "auto"
    mteb_model_meta: ModelMeta | None = None

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        *,
        trust_remote_code: bool = True,
        dtype: Dtype = "auto",
        head_dtype: Literal["model"] | Dtype | None = None,
        max_model_len: int | None = None,
        max_num_batched_tokens: int | None = None,
        max_num_seqs: int = 128,
        tensor_parallel_size: int = 1,
        enable_prefix_caching: bool | None = None,
        gpu_memory_utilization: float = 0.9,
        hf_overrides: dict[str, Any] | None = None,
        pooler_config: PoolerConfig | None = None,
        enforce_eager: bool = False,
        **kwargs: Any,
    ):
        """Wrapper for vllm serving engine.

        Args:
            model: model name string or ModelMeta. When a ModelMeta is given, its
                name and revision are used and the ``revision`` argument is ignored.
            revision: The revision of the model to use.
            trust_remote_code: Whether to trust remote code execution when loading the model.
                Should be True for models with custom code.
            dtype: Data type for model weights. "auto" will automatically select appropriate
                dtype based on hardware and model capabilities. vllm uses flash attention by
                default, which does not support fp32. Therefore, it defaults to using fp16 for
                inference on fp32 models. Testing has shown a relatively small drop in accuracy.
                You can manually opt for fp32, but inference speed will be very slow.
            head_dtype: "head" refers to the last Linear layer(s) of an LLMs, such as the score
                or classifier in a classification model. Uses fp32 for the head by default to
                gain extra precision. When not None, it is forwarded to vllm as the
                ``head_dtype`` entry of ``hf_overrides``.
            max_model_len: Maximum sequence length (context window) supported by the model.
                If None, uses the model's default maximum length.
            max_num_batched_tokens: Maximum number of tokens to process in a single batch.
                If None, automatically determined.
            max_num_seqs: Maximum number of sequences to process concurrently.
            tensor_parallel_size: Number of GPUs for tensor parallelism.
            enable_prefix_caching: Whether to enable KV cache sharing for common prompt prefixes.
                If None, uses the model's default setting.
            gpu_memory_utilization: Target GPU memory utilization ratio (0.0 to 1.0).
            hf_overrides: Dictionary mapping Hugging Face configuration keys to override values.
                The passed dictionary is not modified.
            pooler_config: Controls the behavior of output pooling in pooling models.
            enforce_eager: Whether to disable CUDA graph optimization and use eager execution.
            **kwargs: Additional arguments to pass to the vllm serving engine model.

        Raises:
            ValueError: If a ModelMeta without a name is passed.
        """
        requires_package(
            self,
            "vllm",
            "Wrapper for vllm serving engine",
            install_instruction="pip install mteb[vllm]",
        )

        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        from vllm import LLM, EngineArgs

        if head_dtype is not None:
            # Build a new dict instead of writing into the caller's object.
            hf_overrides = {**(hf_overrides or {}), "head_dtype": head_dtype}
        elif hf_overrides is None:
            hf_overrides = {}

        model_name = model if isinstance(model, str) else model.name

        if isinstance(model, ModelMeta):
            # Only mention the ignored revision when one was actually passed.
            if revision is not None and revision != model.revision:
                logger.info(
                    "Using revision from model meta. Passed revision will be ignored"
                )
            revision = model.revision

        if model_name is None:
            raise ValueError("name in ModelMeta is None. It must be a string.")

        args = EngineArgs(
            model=model_name,
            revision=revision,
            runner="pooling",
            convert=self.convert,  # type: ignore[arg-type]
            max_model_len=max_model_len,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
            tensor_parallel_size=tensor_parallel_size,
            enable_prefix_caching=enable_prefix_caching,
            gpu_memory_utilization=gpu_memory_utilization,
            hf_overrides=hf_overrides,
            pooler_config=pooler_config,
            enforce_eager=enforce_eager,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            **kwargs,
        )
        self.llm = LLM(**vars(args))

        if isinstance(model, str):
            self.mteb_model_meta = ModelMeta.from_hub(model=model, revision=revision)
        else:
            self.mteb_model_meta = model

        atexit.register(self.cleanup)

    def cleanup(self):
        """Clean up the VLLM distributed runtime environment and release GPU resources.

        Safe to call repeatedly, and safe to call on an instance whose
        ``__init__`` failed before the engine was created.
        """
        # ``llm`` is missing when construction failed early; treat as "nothing to do".
        if getattr(self, "llm", None) is None:
            return

        from vllm.distributed import (  # type: ignore[import-not-found]
            cleanup_dist_env_and_memory,
        )

        self.llm = None
        gc.collect()
        cleanup_dist_env_and_memory()
        # The atexit hook holds a reference to this instance; drop it once the
        # engine is gone so the wrapper itself can be garbage collected.
        atexit.unregister(self.cleanup)

    def __del__(self):
        # Best effort only: a finalizer must never raise, and during interpreter
        # shutdown the modules used by cleanup() may already be torn down.
        try:
            self.cleanup()
        except Exception:
            pass
153
+
154
+
155
class VllmEncoderWrapper(AbsEncoder, VllmWrapperBase):
    """vLLM wrapper for Encoder models.

    Args:
        model: model name string or ModelMeta.
        revision: The revision of the model to use.
        prompt_dict: A dictionary mapping task names to prompt strings.
        use_instructions: Whether to use instructions from the prompt_dict.
            When False, values from prompt_dict are used as static prompts (prefixes).
            When True, values from prompt_dict are used as instructions to be formatted
            using the instruction_template.
        instruction_template: A template or callable to format instructions.
            Can be a string with '{instruction}' placeholder or a callable that takes
            the instruction and prompt type and returns a formatted string.
        apply_instruction_to_documents: Whether to apply instructions to documents prompts.
        **kwargs: Additional arguments to pass to the vllm serving engine model.

    Raises:
        ValueError: If ``use_instructions`` is set without an ``instruction_template``,
            or if a string template lacks the ``{instruction}`` placeholder.
    """

    convert = "embed"

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        prompt_dict: dict[str, str] | None = None,
        use_instructions: bool = False,
        instruction_template: (
            str | Callable[[str, PromptType | None], str] | None
        ) = None,
        apply_instruction_to_documents: bool = True,
        **kwargs: Any,
    ):
        if use_instructions and instruction_template is None:
            raise ValueError(
                "To use instructions, an instruction_template must be provided. "
                "For example, `Instruction: {instruction}`"
            )

        if (
            isinstance(instruction_template, str)
            and "{instruction}" not in instruction_template
        ):
            raise ValueError(
                "Instruction template must contain the string '{instruction}'."
            )

        # Prompt configuration is stored before the engine is created.
        self.prompts_dict = prompt_dict
        self.use_instructions = use_instructions
        self.instruction_template = instruction_template
        self.apply_instruction_to_passages = apply_instruction_to_documents
        super().__init__(
            model,
            revision,
            **kwargs,
        )

    def encode(
        self,
        inputs: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        """Encodes the given sentences using the vLLM pooling engine.

        The prompt (a static prefix or a formatted instruction, depending on
        ``use_instructions``) is prepended to every text of every batch, and the
        whole list is embedded with a single ``llm.encode`` call.

        Args:
            inputs: The sentences to encode. Only the ``"text"`` entry of each batch is read.
            task_metadata: The metadata of the task. Used to select the prompt or
                instruction for the task.
            prompt_type: The name type of prompt. (query or document)
            hf_split: Split of current task
            hf_subset: Subset of current task
            **kwargs: Additional arguments to pass to the encoder.

        Returns:
            The encoded sentences, stacked into a single tensor (one row per input text).
        """
        prompt = ""
        if self.use_instructions and self.prompts_dict is not None:
            prompt = self.get_task_instruction(task_metadata, prompt_type)
        elif self.prompts_dict is not None:
            prompt_name = self.get_prompt_name(task_metadata, prompt_type)
            if prompt_name is not None:
                prompt = self.prompts_dict.get(prompt_name, "")

        if (
            self.use_instructions
            and self.apply_instruction_to_passages is False
            and prompt_type == PromptType.document
        ):
            # Log the prompt type itself; this avoids looking the `document`
            # member up through another enum member.
            logger.info(f"No instruction used, because prompt type = {prompt_type}")
            prompt = ""
        else:
            logger.info(
                f"Using instruction: '{prompt}' for task: '{task_metadata.name}' prompt type: '{prompt_type}'"
            )

        prompts = [prompt + text for batch in inputs for text in batch["text"]]
        outputs = self.llm.encode(
            prompts, pooling_task="embed", truncate_prompt_tokens=-1
        )
        embeddings = torch.stack([output.outputs.data for output in outputs])
        return embeddings
263
+
264
+
265
class VllmCrossEncoderWrapper(VllmWrapperBase):
    """vLLM wrapper for CrossEncoder models.

    Scores (query, document) text pairs through the engine's ``score`` API,
    optionally prepending a fixed prefix to each side.
    """

    convert = "classify"

    def __init__(
        self,
        model: str | ModelMeta,
        revision: str | None = None,
        query_prefix: str = "",
        document_prefix: str = "",
        **kwargs: Any,
    ):
        super().__init__(model, revision, **kwargs)
        self.query_prefix = query_prefix
        self.document_prefix = document_prefix

    def predict(
        self,
        inputs1: DataLoader[BatchedInput],
        inputs2: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        hf_split: str,
        hf_subset: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        """Predicts relevance scores for pairs of inputs. Note that, unlike the encoder, the cross-encoder can compare across inputs.

        Args:
            inputs1: First Dataloader of inputs to encode. For reranking tasks, these are queries (for text only tasks `QueryDatasetType`).
            inputs2: Second Dataloader of inputs to encode. For reranking, these are documents (for text only tasks `RetrievalOutputType`).
            task_metadata: Metadata of the current task.
            hf_split: Split of current task, allows to know some additional information about current split.
                E.g. Current language
            hf_subset: Subset of current task. Similar to `hf_split` to get more information
            prompt_type: The name type of prompt. (query or passage)
            **kwargs: Additional arguments to pass to the cross-encoder.

        Returns:
            The predicted relevance scores for each inputs pair.
        """
        # Flatten both loaders into plain lists of prefixed strings.
        query_texts = []
        for query_batch in inputs1:
            for text in query_batch["text"]:
                query_texts.append(self.query_prefix + text)

        document_texts = []
        for document_batch in inputs2:
            for text in document_batch["text"]:
                document_texts.append(self.document_prefix + text)
        # TODO: support score prompt

        scored = self.llm.score(
            query_texts,
            document_texts,
            truncate_prompt_tokens=-1,
            use_tqdm=False,
        )
        return np.array([item.outputs.score for item in scored])
mteb/py.typed ADDED
File without changes
@@ -1,9 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
1
4
  import json
2
5
  import logging
3
6
  import warnings
4
- from collections.abc import Callable, Iterable, Iterator, Sequence
7
+ from collections.abc import Callable, Iterable, Iterator
5
8
  from pathlib import Path
6
- from typing import Any, Literal
9
+ from typing import Any, Literal, cast
7
10
 
8
11
  import pandas as pd
9
12
  from packaging.version import InvalidVersion, Version
@@ -15,6 +18,7 @@ from mteb.abstasks.task_metadata import (
15
18
  TaskDomain,
16
19
  TaskType,
17
20
  )
21
+ from mteb.benchmarks.benchmark import Benchmark
18
22
  from mteb.models import ModelMeta
19
23
  from mteb.models.get_model_meta import get_model_metas
20
24
  from mteb.types import (
@@ -31,6 +35,25 @@ from .model_result import ModelResult, _aggregate_and_pivot
31
35
  logger = logging.getLogger(__name__)
32
36
 
33
37
 
38
@functools.lru_cache
def _get_cached_model_metas() -> dict[str, str | None]:
    """Return a cached ``{model name: revision}`` mapping built from `get_model_metas()`.

    Metas without a name are left out. The mapping is computed once and the
    same dict object is returned on every later call.
    """
    revision_by_name: dict[str, str | None] = {}
    for meta in get_model_metas():
        if meta.name is None:
            continue
        revision_by_name[meta.name] = meta.revision
    return revision_by_name
44
+
45
+
46
+ @functools.lru_cache(maxsize=10000)
47
+ def _parse_version_cached(version_str: str | None) -> Version | None:
48
+ """Cache version parsing to avoid repeated parsing."""
49
+ if version_str is None:
50
+ return None
51
+ try:
52
+ return Version(version_str)
53
+ except (InvalidVersion, TypeError):
54
+ return None
55
+
56
+
34
57
  class BenchmarkResults(BaseModel):
35
58
  """Data class to hold the benchmark results of a model.
36
59
 
@@ -39,10 +62,10 @@ class BenchmarkResults(BaseModel):
39
62
  """
40
63
 
41
64
  model_results: list[ModelResult]
42
- model_config = (
43
- ConfigDict( # to free up the name model_results which is otherwise protected
44
- protected_namespaces=(),
45
- )
65
+ benchmark: Benchmark | None = None
66
+ model_config = ConfigDict(
67
+ protected_namespaces=(), # to free up the name model_results which is otherwise protected
68
+ arbitrary_types_allowed=True, # Benchmark is dataclasses.dataclass
46
69
  )
47
70
 
48
71
  def __repr__(self) -> str:
@@ -57,10 +80,10 @@ class BenchmarkResults(BaseModel):
57
80
  task_names: list[str] | None = None,
58
81
  languages: list[str] | None = None,
59
82
  domains: list[TaskDomain] | None = None,
60
- task_types: list[TaskType] | None = None, # type: ignore
83
+ task_types: list[TaskType] | None = None,
61
84
  modalities: list[Modalities] | None = None,
62
85
  is_public: bool | None = None,
63
- ) -> Self:
86
+ ) -> BenchmarkResults:
64
87
  # TODO: Same as filter_models
65
88
  model_results = [
66
89
  res._filter_tasks(
@@ -77,7 +100,7 @@ class BenchmarkResults(BaseModel):
77
100
  model_results=[res for res in model_results if res.task_results]
78
101
  )
79
102
 
80
- def select_tasks(self, tasks: Sequence[AbsTask]) -> Self:
103
+ def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
81
104
  """Select tasks from the benchmark results.
82
105
 
83
106
  Args:
@@ -95,7 +118,7 @@ class BenchmarkResults(BaseModel):
95
118
  self,
96
119
  names: list[str] | list[ModelMeta],
97
120
  revisions: list[str | None] | None = None,
98
- ) -> Self:
121
+ ) -> BenchmarkResults:
99
122
  """Get models by name and revision.
100
123
 
101
124
  Args:
@@ -108,7 +131,7 @@ class BenchmarkResults(BaseModel):
108
131
  models_res = []
109
132
  _revisions = revisions if revisions is not None else [None] * len(names)
110
133
 
111
- name_rev = {}
134
+ name_rev: dict[str, str | None] = {}
112
135
 
113
136
  if len(names) != len(_revisions):
114
137
  raise ValueError(
@@ -117,9 +140,12 @@ class BenchmarkResults(BaseModel):
117
140
 
118
141
  for name, revision in zip(names, _revisions):
119
142
  if isinstance(name, ModelMeta):
143
+ if name.name is None:
144
+ raise ValueError("name in ModelMeta is None. It must be a string.")
120
145
  name_rev[name.name] = name.revision
121
146
  else:
122
- name_rev[name] = revision
147
+ name_ = cast(str, name)
148
+ name_rev[name_] = revision
123
149
 
124
150
  for model_res in self.model_results:
125
151
  model_name = model_res.model_name
@@ -139,7 +165,7 @@ class BenchmarkResults(BaseModel):
139
165
  n_parameters_range: tuple[int | None, int | None] = (None, None),
140
166
  use_instructions: bool | None = None,
141
167
  zero_shot_on: list[AbsTask] | None = None,
142
- ) -> Self:
168
+ ) -> BenchmarkResults:
143
169
  # mostly a utility function for the leaderboard app.
144
170
  # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
145
171
  # interface would then be the same as the get_models function
@@ -162,7 +188,7 @@ class BenchmarkResults(BaseModel):
162
188
 
163
189
  return type(self).model_construct(model_results=new_model_results)
164
190
 
165
- def join_revisions(self) -> Self:
191
+ def join_revisions(self) -> BenchmarkResults:
166
192
  """Join revisions of the same model.
167
193
 
168
194
  In case of conflicts, the following rules are applied:
@@ -173,40 +199,6 @@ class BenchmarkResults(BaseModel):
173
199
  Returns:
174
200
  A new BenchmarkResults object with the revisions joined.
175
201
  """
176
-
177
- def parse_version(version_str: str) -> Version | None:
178
- try:
179
- return Version(version_str)
180
- except (InvalidVersion, TypeError):
181
- return None
182
-
183
- def keep_best(group: pd.DataFrame) -> pd.DataFrame:
184
- # Filtering out task_results where no scores are present
185
- group = group[group["has_scores"]]
186
- is_main_revision = group["revision"] == group["main_revision"]
187
- # If the main revision is present we select that
188
- if is_main_revision.sum() > 0:
189
- return group[is_main_revision].head(n=1)
190
- unique_revisions = group["revision"].unique()
191
-
192
- # ensure None/NA/"external" revisions is filtered out
193
- group.loc[group["revision"].isna(), "revision"] = "no_revision_available"
194
- group.loc[group["revision"] == "external", "revision"] = (
195
- "no_revision_available"
196
- )
197
-
198
- # Filtering out no_revision_available if other revisions are present
199
- if (len(unique_revisions) > 1) and (
200
- "no_revision_available" in unique_revisions
201
- ):
202
- group = group[group["revision"] != "no_revision_available"]
203
- # If there are any not-NA mteb versions, we select the latest one
204
- if group["mteb_version"].notna().any():
205
- group = group.dropna(subset=["mteb_version"])
206
- group = group.sort_values("mteb_version", ascending=False)
207
- return group.head(n=1)
208
- return group.head(n=1)
209
-
210
202
  records = []
211
203
  for model_result in self:
212
204
  for task_result in model_result.task_results:
@@ -223,21 +215,58 @@ class BenchmarkResults(BaseModel):
223
215
  if not records:
224
216
  return BenchmarkResults.model_construct(model_results=[])
225
217
  task_df = pd.DataFrame.from_records(records)
226
- model_to_main_revision = {
227
- meta.name: meta.revision for meta in get_model_metas()
228
- }
229
- task_df["main_revision"] = task_df["model"].map(model_to_main_revision) # type: ignore
230
- task_df["mteb_version"] = task_df["mteb_version"].map(parse_version) # type: ignore
231
- task_df = (
232
- task_df.groupby(["model", "task_name"])
233
- .apply(keep_best)
234
- .reset_index(drop=True)
218
+
219
+ # Use cached model metas
220
+ model_to_main_revision = _get_cached_model_metas()
221
+ task_df["main_revision"] = task_df["model"].map(model_to_main_revision)
222
+
223
+ # Use cached version parsing
224
+ task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)
225
+
226
+ # Filter out rows without scores first
227
+ task_df = task_df[task_df["has_scores"]]
228
+
229
+ # Optimize groupby with vectorized operations
230
+ # Sort by priority: main_revision match, then mteb_version (descending), then revision
231
+ task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]
232
+
233
+ # Handle None/NA/external revisions
234
+ task_df["revision_clean"] = task_df["revision"].copy()
235
+ task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
236
+ "no_revision_available"
237
+ )
238
+ task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
239
+ "no_revision_available"
240
+ )
241
+
242
+ # Create a priority column for sorting
243
+ # Higher priority = better to keep
244
+ # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
245
+ task_df["priority"] = 0
246
+ task_df.loc[task_df["is_main_revision"], "priority"] += 1000
247
+ task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
248
+ task_df.loc[
249
+ task_df["revision_clean"] != "no_revision_available", "priority"
250
+ ] += 10
251
+
252
+ # Sort by priority (desc), mteb_version (desc), and take first per group
253
+ task_df = task_df.sort_values(
254
+ ["model", "task_name", "priority", "mteb_version"],
255
+ ascending=[True, True, False, False],
256
+ na_position="last",
235
257
  )
258
+
259
+ task_df = task_df.groupby(["model", "task_name"], as_index=False).first()
260
+
261
+ # Reconstruct model results
236
262
  model_results = []
263
+ # Group by original revision to maintain deterministic behavior
264
+ # After the first() selection above, each (model, task_name) is unique,
265
+ # so grouping by original revision ensures consistent ModelResult creation
237
266
  for (model, model_revision), group in task_df.groupby(["model", "revision"]):
238
267
  model_result = ModelResult.model_construct(
239
- model_name=model,
240
- model_revision=model_revision,
268
+ model_name=model, # type: ignore[arg-type]
269
+ model_revision=model_revision, # type: ignore[arg-type]
241
270
  task_results=list(group["task_result"]),
242
271
  )
243
272
  model_results.append(model_result)
@@ -268,7 +297,7 @@ class BenchmarkResults(BaseModel):
268
297
  {
269
298
  "model": model_res.model_name,
270
299
  "revision": model_res.model_revision,
271
- **model_scores, # type: ignore
300
+ **model_scores,
272
301
  }
273
302
  )
274
303
  except Exception as e:
@@ -296,7 +325,7 @@ class BenchmarkResults(BaseModel):
296
325
 
297
326
  def to_dataframe(
298
327
  self,
299
- aggregation_level: Literal["subset", "split", "task"] = "task",
328
+ aggregation_level: Literal["subset", "split", "task", "language"] = "task",
300
329
  aggregation_fn: Callable[[list[Score]], Any] | None = None,
301
330
  include_model_revision: bool = False,
302
331
  format: Literal["wide", "long"] = "wide",
@@ -321,6 +350,7 @@ class BenchmarkResults(BaseModel):
321
350
  - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
322
351
  - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
323
352
  - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
353
+ - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.
324
354
  aggregation_fn: The function to use for aggregation. If None, the mean will be used.
325
355
  include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
326
356
  If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.
@@ -340,7 +370,9 @@ class BenchmarkResults(BaseModel):
340
370
  scores_data.extend(model_result._get_score_for_table())
341
371
 
342
372
  if not scores_data:
343
- logger.warning("No scores data available. Returning empty DataFrame.")
373
+ msg = "No scores data available. Returning empty DataFrame."
374
+ logger.warning(msg)
375
+ warnings.warn(msg)
344
376
  return pd.DataFrame()
345
377
 
346
378
  # Create DataFrame
@@ -361,7 +393,24 @@ class BenchmarkResults(BaseModel):
361
393
  format=format,
362
394
  )
363
395
 
364
- def __iter__(self) -> Iterator[ModelResult]:
396
+ def get_benchmark_result(self) -> pd.DataFrame:
397
+ """Get aggregated scores for each model in the benchmark.
398
+
399
+ Uses the benchmark's summary table creation method to compute scores.
400
+
401
+ Returns:
402
+ A DataFrame with the aggregated benchmark scores for each model.
403
+ """
404
+ if self.benchmark is None:
405
+ raise ValueError(
406
+ "No benchmark associated with these results (self.benchmark is None). "
407
+ "To get benchmark results, load results with a Benchmark object. "
408
+ "`results = cache.load_results(tasks='MTEB(eng, v2)')`"
409
+ )
410
+
411
+ return self.benchmark._create_summary_table(self)
412
+
413
+ def __iter__(self) -> Iterator[ModelResult]: # type: ignore[override]
365
414
  return iter(self.model_results)
366
415
 
367
416
  def __getitem__(self, index: int) -> ModelResult:
@@ -383,11 +432,11 @@ class BenchmarkResults(BaseModel):
383
432
  out_file.write(self.model_dump_json(indent=2))
384
433
 
385
434
  @classmethod
386
- def from_validated(cls, **data) -> Self:
435
+ def from_validated(cls, **data: Any) -> BenchmarkResults:
387
436
  """Create BenchmarkResults from validated data.
388
437
 
389
438
  Args:
390
- data: Dictionary containing the data.
439
+ **data: Arbitrary keyword arguments containing the data.
391
440
 
392
441
  Returns:
393
442
  An instance of BenchmarkResults.