mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ greennode_embedding_large_vn_v1_training_data = {
7
7
 
8
8
  greennode_embedding_large_vn_v1 = ModelMeta(
9
9
  name="GreenNode/GreenNode-Embedding-Large-VN-V1",
10
+ model_type=["dense"],
10
11
  revision="660def1f6e1c8ecdf39f6f9c95829e3cf0cef837",
11
12
  release_date="2024-04-11",
12
13
  languages=[
@@ -21,7 +22,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
21
22
  max_tokens=8194,
22
23
  reference="https://huggingface.co/GreenNode/GreenNode-Embedding-Large-VN-V1",
23
24
  similarity_fn_name="cosine",
24
- framework=["Sentence Transformers", "PyTorch"],
25
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
25
26
  use_instructions=False,
26
27
  public_training_code=None,
27
28
  public_training_data="https://huggingface.co/datasets/GreenNode/GreenNode-Table-Markdown-Retrieval-VN",
@@ -31,6 +32,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
31
32
 
32
33
  greennode_embedding_large_vn_mixed_v1 = ModelMeta(
33
34
  name="GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1",
35
+ model_type=["dense"],
34
36
  revision="1d3dddb3862292dab4bd3eddf0664c0335ad5843",
35
37
  release_date="2024-04-11",
36
38
  languages=[
@@ -45,7 +47,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
45
47
  max_tokens=8194,
46
48
  reference="https://huggingface.co/GreenNode/GreenNode-Embedding-Large-VN-Mixed-V1",
47
49
  similarity_fn_name="cosine",
48
- framework=["Sentence Transformers", "PyTorch"],
50
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
49
51
  use_instructions=False,
50
52
  public_training_code=None,
51
53
  public_training_data="https://huggingface.co/datasets/GreenNode/GreenNode-Table-Markdown-Retrieval-VN",
@@ -55,6 +57,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
55
57
 
56
58
  aiteamvn_vietnamese_embeddings = ModelMeta(
57
59
  name="AITeamVN/Vietnamese_Embedding",
60
+ model_type=["dense"],
58
61
  revision="fcbbb905e6c3757d421aaa5db6fd7c53d038f6fb",
59
62
  release_date="2024-03-17",
60
63
  languages=[
@@ -69,16 +72,23 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
69
72
  max_tokens=8194,
70
73
  reference="https://huggingface.co/AITeamVN/Vietnamese_Embedding",
71
74
  similarity_fn_name="cosine",
72
- framework=["Sentence Transformers", "PyTorch"],
75
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
73
76
  use_instructions=False,
74
77
  public_training_code=None,
75
78
  public_training_data=None,
76
79
  training_datasets=None,
77
80
  adapted_from="BAAI/bge-m3",
81
+ citation="""@misc{Vietnamese_Embedding,
82
+ title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
83
+ author={Nguyen Nho Trung, Nguyen Nhat Quang, Nguyen Van Huy},
84
+ year={2025},
85
+ publisher={Huggingface},
86
+ }""",
78
87
  )
79
88
 
80
89
  hiieu_halong_embedding = ModelMeta(
81
90
  name="hiieu/halong_embedding",
91
+ model_type=["dense"],
82
92
  revision="b57776031035f70ed2030d2e35ecc533eb0f8f71",
83
93
  release_date="2024-07-06",
84
94
  languages=[
@@ -94,15 +104,22 @@ hiieu_halong_embedding = ModelMeta(
94
104
  max_tokens=514,
95
105
  reference="https://huggingface.co/hiieu/halong_embedding",
96
106
  similarity_fn_name="cosine",
97
- framework=["Sentence Transformers", "PyTorch"],
107
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
98
108
  public_training_code=None,
99
109
  public_training_data=None,
100
110
  training_datasets=None,
101
111
  adapted_from="intfloat/multilingual-e5-base",
112
+ citation="""@misc{HalongEmbedding,
113
+ title={HalongEmbedding: A Vietnamese Text Embedding},
114
+ author={Ngo Hieu},
115
+ year={2024},
116
+ publisher={Huggingface},
117
+ }""",
102
118
  )
103
119
 
104
120
  sup_simcse_vietnamese_phobert_base_ = ModelMeta(
105
121
  name="VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
122
+ model_type=["dense"],
106
123
  revision="608779b86741a8acd8c8d38132974ff04086b138",
107
124
  release_date="2021-05-26",
108
125
  languages=[
@@ -118,14 +135,29 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
118
135
  license="apache-2.0",
119
136
  public_training_code=None,
120
137
  public_training_data=None,
121
- framework=["PyTorch", "Sentence Transformers"],
138
+ framework=["PyTorch", "Sentence Transformers", "Transformers", "safetensors"],
122
139
  reference="https://huggingface.co/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
123
140
  similarity_fn_name="cosine",
124
141
  training_datasets=None,
142
+ citation="""@article{gao2021simcse,
143
+ title={{SimCSE}: Simple Contrastive Learning of Sentence Embeddings},
144
+ author={Gao, Tianyu and Yao, Xingcheng and Chen, Danqi},
145
+ journal={arXiv preprint arXiv:2104.08821},
146
+ year={2021}
147
+ }
148
+
149
+ @inproceedings{phobert,
150
+ title = {{PhoBERT: Pre-trained language models for Vietnamese}},
151
+ author = {Dat Quoc Nguyen and Anh Tuan Nguyen},
152
+ booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
153
+ year = {2020},
154
+ pages = {1037--1042}
155
+ }""",
125
156
  )
126
157
 
127
158
  bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
128
159
  name="bkai-foundation-models/vietnamese-bi-encoder",
160
+ model_type=["dense"],
129
161
  revision="84f9d9ada0d1a3c37557398b9ae9fcedcdf40be0",
130
162
  release_date="2023-09-09",
131
163
  languages=[
@@ -141,8 +173,15 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
141
173
  license="apache-2.0",
142
174
  public_training_code=None,
143
175
  public_training_data=None,
144
- framework=["PyTorch", "Sentence Transformers"],
176
+ framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
145
177
  reference="https://huggingface.co/bkai-foundation-models/vietnamese-bi-encoder",
146
178
  similarity_fn_name="cosine",
147
179
  training_datasets=None,
180
+ citation="""
181
+ @article{duc2024towards,
182
+ title={Towards Comprehensive Vietnamese Retrieval-Augmented Generation and Large Language Models},
183
+ author={Nguyen Quang Duc, Le Hai Son, Nguyen Duc Nhan, Nguyen Dich Nhat Minh, Le Thanh Huong, Dinh Viet Sang},
184
+ journal={arXiv preprint arXiv:2403.01616},
185
+ year={2024}
186
+ }""",
148
187
  )
@@ -247,6 +247,7 @@ visualized_bge_base = ModelMeta(
247
247
  image_tokens_num=196,
248
248
  ),
249
249
  name="BAAI/bge-visualized-base",
250
+ model_type=["dense"],
250
251
  languages=["eng-Latn"],
251
252
  revision="98db10b10d22620010d06f11733346e1c98c34aa",
252
253
  release_date="2024-06-06",
@@ -274,6 +275,7 @@ visualized_bge_m3 = ModelMeta(
274
275
  image_tokens_num=256,
275
276
  ),
276
277
  name="BAAI/bge-visualized-m3",
278
+ model_type=["dense"],
277
279
  languages=["eng-Latn"],
278
280
  revision="98db10b10d22620010d06f11733346e1c98c34aa",
279
281
  release_date="2024-06-06",
@@ -41,7 +41,7 @@ class VLM2VecWrapper(AbsEncoder):
41
41
  model_name,
42
42
  "pip install flash-attn --no-build-isolation",
43
43
  ):
44
- import flash_attn # noqa
44
+ pass
45
45
 
46
46
  requires_package(self, "peft", model_name, "pip install 'mteb[peft]'")
47
47
  from peft import LoraConfig, PeftModel
@@ -269,6 +269,7 @@ vlm2vec_training_datasets = set(
269
269
  vlm2vec_lora = ModelMeta(
270
270
  loader=VLM2VecWrapper,
271
271
  name="TIGER-Lab/VLM2Vec-LoRA",
272
+ model_type=["dense"],
272
273
  languages=["eng-Latn"],
273
274
  revision="7403b6327958071c1e33c822c7453adadccc7298",
274
275
  release_date="2024-10-08",
@@ -281,7 +282,7 @@ vlm2vec_lora = ModelMeta(
281
282
  open_weights=True,
282
283
  public_training_code="https://github.com/TIGER-AI-Lab/VLM2Vec",
283
284
  public_training_data="https://huggingface.co/datasets/TIGER-Lab/MMEB-train",
284
- framework=["PyTorch"],
285
+ framework=["PyTorch", "Transformers"],
285
286
  reference="https://huggingface.co/TIGER-Lab/VLM2Vec-LoRA",
286
287
  similarity_fn_name=ScoringFunction.COSINE,
287
288
  use_instructions=True,
@@ -292,6 +293,7 @@ vlm2vec_lora = ModelMeta(
292
293
  vlm2vec_full = ModelMeta(
293
294
  loader=VLM2VecWrapper,
294
295
  name="TIGER-Lab/VLM2Vec-Full",
296
+ model_type=["dense"],
295
297
  languages=["eng-Latn"],
296
298
  revision="e9afa98002097ac2471827ba23ea1f2ddd229480",
297
299
  release_date="2024-10-08",
@@ -304,7 +306,7 @@ vlm2vec_full = ModelMeta(
304
306
  open_weights=True,
305
307
  public_training_code="https://github.com/TIGER-AI-Lab/VLM2Vec",
306
308
  public_training_data="https://huggingface.co/TIGER-Lab/VLM2Vec-Full",
307
- framework=["PyTorch"],
309
+ framework=["PyTorch", "Transformers", "safetensors"],
308
310
  reference="https://huggingface.co/TIGER-Lab/VLM2Vec-Full",
309
311
  similarity_fn_name=ScoringFunction.COSINE,
310
312
  use_instructions=True,
@@ -25,6 +25,9 @@ VOYAGE_DTYPE_TRANSLATION = {
25
25
 
26
26
  # Total token limits per model based on VoyageAI documentation
27
27
  VOYAGE_TOTAL_TOKEN_LIMITS = {
28
+ "voyage-4-large": 120_000,
29
+ "voyage-4": 320_000,
30
+ "voyage-4-lite": 1_000_000,
28
31
  "voyage-3.5-lite": 1_000_000,
29
32
  "voyage-3.5": 320_000,
30
33
  "voyage-2": 320_000,
@@ -206,8 +209,87 @@ model_prompts = {
206
209
  PromptType.document.value: "document",
207
210
  }
208
211
 
212
+ voyage_4 = ModelMeta(
213
+ name="voyageai/voyage-4",
214
+ model_type=["dense"],
215
+ revision="1",
216
+ release_date="2026-01-15",
217
+ languages=None, # supported languages not specified
218
+ loader=VoyageModel,
219
+ loader_kwargs=dict(
220
+ max_tokens=32000,
221
+ model_prompts=model_prompts,
222
+ ),
223
+ max_tokens=32000,
224
+ embed_dim=1024,
225
+ open_weights=False,
226
+ n_parameters=None,
227
+ memory_usage_mb=None,
228
+ license=None,
229
+ reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
230
+ similarity_fn_name="cosine",
231
+ framework=["API"],
232
+ use_instructions=True,
233
+ training_datasets=VOYAGE_TRAINING_DATA,
234
+ public_training_code=None,
235
+ public_training_data=None,
236
+ )
237
+
238
+ voyage_4_lite = ModelMeta(
239
+ name="voyageai/voyage-4-lite",
240
+ model_type=["dense"],
241
+ revision="1",
242
+ release_date="2026-01-15",
243
+ languages=None, # supported languages not specified
244
+ loader=VoyageModel,
245
+ loader_kwargs=dict(
246
+ max_tokens=32000,
247
+ model_prompts=model_prompts,
248
+ ),
249
+ max_tokens=32000,
250
+ embed_dim=1024,
251
+ open_weights=False,
252
+ n_parameters=None,
253
+ memory_usage_mb=None,
254
+ license=None,
255
+ reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
256
+ similarity_fn_name="cosine",
257
+ framework=["API"],
258
+ use_instructions=True,
259
+ training_datasets=VOYAGE_TRAINING_DATA,
260
+ public_training_code=None,
261
+ public_training_data=None,
262
+ )
263
+
264
+ voyage_4_large = ModelMeta(
265
+ name="voyageai/voyage-4-large",
266
+ model_type=["dense"],
267
+ revision="1",
268
+ release_date="2026-01-15",
269
+ languages=None, # supported languages not specified
270
+ loader=VoyageModel,
271
+ loader_kwargs=dict(
272
+ max_tokens=32000,
273
+ model_prompts=model_prompts,
274
+ ),
275
+ max_tokens=32000,
276
+ embed_dim=1024,
277
+ open_weights=False,
278
+ n_parameters=None,
279
+ memory_usage_mb=None,
280
+ license=None,
281
+ reference="https://blog.voyageai.com/2026/01/15/voyage-4/",
282
+ similarity_fn_name="cosine",
283
+ framework=["API"],
284
+ use_instructions=True,
285
+ training_datasets=VOYAGE_TRAINING_DATA,
286
+ public_training_code=None,
287
+ public_training_data=None,
288
+ )
289
+
209
290
  voyage_3_large = ModelMeta(
210
291
  name="voyageai/voyage-3-large", # Date of publication of this post https://blog.voyageai.com/2025/01/07/voyage-3-large/
292
+ model_type=["dense"],
211
293
  revision="1",
212
294
  release_date="2025-01-07",
213
295
  languages=None, # supported languages not specified
@@ -229,11 +311,13 @@ voyage_3_large = ModelMeta(
229
311
  training_datasets=VOYAGE_TRAINING_DATA,
230
312
  public_training_code=None,
231
313
  public_training_data=None,
314
+ superseded_by="voyageai/voyage-4-large",
232
315
  )
233
316
 
234
317
 
235
318
  voyage_3_5 = ModelMeta(
236
319
  name="voyageai/voyage-3.5",
320
+ model_type=["dense"],
237
321
  revision="1",
238
322
  release_date="2025-01-21",
239
323
  languages=None, # supported languages not specified
@@ -255,10 +339,12 @@ voyage_3_5 = ModelMeta(
255
339
  training_datasets=VOYAGE_TRAINING_DATA,
256
340
  public_training_code=None,
257
341
  public_training_data=None,
342
+ superseded_by="voyageai/voyage-4",
258
343
  )
259
344
 
260
345
  voyage_3_5_int8 = ModelMeta(
261
346
  name="voyageai/voyage-3.5 (output_dtype=int8)",
347
+ model_type=["dense"],
262
348
  revision="1",
263
349
  release_date="2025-01-21",
264
350
  languages=None, # supported languages not specified
@@ -285,6 +371,7 @@ voyage_3_5_int8 = ModelMeta(
285
371
 
286
372
  voyage_3_5_binary = ModelMeta(
287
373
  name="voyageai/voyage-3.5 (output_dtype=binary)",
374
+ model_type=["dense"],
288
375
  revision="1",
289
376
  release_date="2025-01-21",
290
377
  languages=None, # supported languages not specified
@@ -311,6 +398,7 @@ voyage_3_5_binary = ModelMeta(
311
398
 
312
399
  voyage_large_2_instruct = ModelMeta(
313
400
  name="voyageai/voyage-large-2-instruct",
401
+ model_type=["dense"],
314
402
  revision="1",
315
403
  release_date="2024-05-05",
316
404
  languages=None, # supported languages not specified
@@ -336,6 +424,7 @@ voyage_large_2_instruct = ModelMeta(
336
424
 
337
425
  voyage_finance_2 = ModelMeta(
338
426
  name="voyageai/voyage-finance-2",
427
+ model_type=["dense"],
339
428
  revision="1",
340
429
  release_date="2024-05-30",
341
430
  languages=None, # supported languages not specified
@@ -361,6 +450,7 @@ voyage_finance_2 = ModelMeta(
361
450
 
362
451
  voyage_law_2 = ModelMeta(
363
452
  name="voyageai/voyage-law-2",
453
+ model_type=["dense"],
364
454
  revision="1",
365
455
  release_date="2024-04-15",
366
456
  languages=None, # supported languages not specified
@@ -386,6 +476,7 @@ voyage_law_2 = ModelMeta(
386
476
 
387
477
  voyage_code_2 = ModelMeta(
388
478
  name="voyageai/voyage-code-2",
479
+ model_type=["dense"],
389
480
  revision="1",
390
481
  release_date="2024-01-23",
391
482
  languages=None, # supported languages not specified
@@ -411,6 +502,7 @@ voyage_code_2 = ModelMeta(
411
502
 
412
503
  voyage_code_3 = ModelMeta(
413
504
  name="voyageai/voyage-code-3",
505
+ model_type=["dense"],
414
506
  revision="1",
415
507
  release_date="2024-12-04",
416
508
  languages=None, # supported languages not specified
@@ -437,6 +529,7 @@ voyage_code_3 = ModelMeta(
437
529
 
438
530
  voyage_large_2 = ModelMeta(
439
531
  name="voyageai/voyage-large-2", # Date of publication of this post https://blog.voyageai.com/2023/10/29/voyage-embeddings/
532
+ model_type=["dense"],
440
533
  revision="1",
441
534
  release_date="2023-10-29",
442
535
  languages=None, # supported languages not specified
@@ -462,6 +555,7 @@ voyage_large_2 = ModelMeta(
462
555
 
463
556
  voyage_2 = ModelMeta(
464
557
  name="voyageai/voyage-2",
558
+ model_type=["dense"],
465
559
  revision="1",
466
560
  release_date="2023-10-29",
467
561
  languages=None, # supported languages not specified
@@ -486,6 +580,7 @@ voyage_2 = ModelMeta(
486
580
  )
487
581
  voyage_multilingual_2 = ModelMeta(
488
582
  name="voyageai/voyage-multilingual-2",
583
+ model_type=["dense"],
489
584
  revision="1",
490
585
  release_date="2024-06-10",
491
586
  languages=None, # supported languages not specified
@@ -511,6 +606,7 @@ voyage_multilingual_2 = ModelMeta(
511
606
 
512
607
  voyage_3 = ModelMeta(
513
608
  name="voyageai/voyage-3",
609
+ model_type=["dense"],
514
610
  revision="1",
515
611
  release_date="2024-09-18",
516
612
  languages=None, # supported languages not specified
@@ -536,6 +632,7 @@ voyage_3 = ModelMeta(
536
632
 
537
633
  voyage_3_lite = ModelMeta(
538
634
  name="voyageai/voyage-3-lite",
635
+ model_type=["dense"],
539
636
  revision="1",
540
637
  release_date="2024-09-18",
541
638
  languages=None, # supported languages not specified
@@ -557,10 +654,12 @@ voyage_3_lite = ModelMeta(
557
654
  training_datasets=VOYAGE_TRAINING_DATA,
558
655
  public_training_code=None,
559
656
  public_training_data=None,
657
+ superseded_by="voyageai/voyage-4-lite",
560
658
  )
561
659
 
562
660
  voyage_3_exp = ModelMeta(
563
661
  name="voyageai/voyage-3-m-exp",
662
+ model_type=["dense"],
564
663
  revision="1",
565
664
  release_date="2025-01-08",
566
665
  languages=["eng-Latn"],
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any, Literal
4
+ from typing import TYPE_CHECKING, Any, Literal
3
5
 
4
6
  import torch
5
- from PIL import Image
6
7
  from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
@@ -12,6 +13,11 @@ from mteb.models.abs_encoder import AbsEncoder
12
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
13
14
  from mteb.types import Array, BatchedInput, PromptType
14
15
 
16
+ if TYPE_CHECKING:
17
+ from PIL import Image
18
+
19
+ logger = logging.getLogger(__name__)
20
+
15
21
 
16
22
  def _downsample_image(
17
23
  image: Image.Image, max_pixels: int = 16000000, target_longest_side: int = 4000
@@ -33,18 +39,18 @@ def _downsample_image(
33
39
  new_width = int(width * (target_longest_side / height))
34
40
 
35
41
  new_size = (new_width, new_height)
36
- logging.info(
42
+ logger.info(
37
43
  f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
38
44
  )
39
- return image.resize(new_size, Image.LANCZOS) # type: ignore
45
+ return image.resize(new_size, Image.LANCZOS)
40
46
  if width > height:
41
47
  if width > 10000:
42
- logging.error("Processing extremely wide images.")
43
- return image.resize((10000, height), Image.LANCZOS) # type: ignore
48
+ logger.error("Processing extremely wide images.")
49
+ return image.resize((10000, height), Image.LANCZOS)
44
50
  else:
45
51
  if height > 10000:
46
- logging.error("Processing extremely high images.")
47
- return image.resize((width, 10000), Image.LANCZOS) # type: ignore
52
+ logger.error("Processing extremely high images.")
53
+ return image.resize((width, 10000), Image.LANCZOS)
48
54
  return image
49
55
 
50
56
 
@@ -149,6 +155,7 @@ def voyage_v_loader(model_name, **kwargs):
149
155
  show_progress_bar: bool = True,
150
156
  **kwargs: Any,
151
157
  ) -> Array:
158
+ input_type = "document" # default
152
159
  if prompt_type is not None:
153
160
  if prompt_type == PromptType.document:
154
161
  input_type = "document"
@@ -197,8 +204,9 @@ def voyage_v_loader(model_name, **kwargs):
197
204
 
198
205
 
199
206
  voyage_v = ModelMeta(
200
- loader=voyage_v_loader, # type: ignore
207
+ loader=voyage_v_loader,
201
208
  name="voyageai/voyage-multimodal-3",
209
+ model_type=["dense"],
202
210
  languages=[], # Unknown
203
211
  revision="1",
204
212
  release_date="2024-11-10",
@@ -24,6 +24,7 @@ xyz_zh_datasets = {
24
24
 
25
25
  xyz_embedding = ModelMeta(
26
26
  name="fangxq/XYZ-embedding",
27
+ model_type=["dense"],
27
28
  languages=["zho-Hans"],
28
29
  loader=sentence_transformers_loader,
29
30
  open_weights=True,
@@ -115,6 +115,7 @@ Youtu_Embedding_V1 = ModelMeta(
115
115
  max_seq_length=8192,
116
116
  ),
117
117
  name="tencent/Youtu-Embedding",
118
+ model_type=["dense"],
118
119
  languages=["zho-Hans"],
119
120
  revision="32e04afc24817c187a8422e7bdbb493b19796d47",
120
121
  release_date="2025-09-28",
@@ -126,7 +127,7 @@ Youtu_Embedding_V1 = ModelMeta(
126
127
  max_tokens=8192,
127
128
  reference="https://huggingface.co/tencent/Youtu-Embedding",
128
129
  similarity_fn_name="cosine",
129
- framework=["Sentence Transformers", "PyTorch"],
130
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
130
131
  use_instructions=True,
131
132
  public_training_code=None,
132
133
  public_training_data=None,
@@ -0,0 +1,34 @@
1
+ from mteb.models import ModelMeta, sentence_transformers_loader
2
+
3
+ yuan_emb_zh_datasets = {
4
+ "CMedQAv2-reranking",
5
+ "DuRetrieval",
6
+ "MMarcoReranking",
7
+ "T2Reranking",
8
+ "T2Retrieval",
9
+ }
10
+
11
+ # not in mteb
12
+ # "Multi-CPR":"http://github.com/Alibaba-NLP/Multi-CPR",
13
+
14
+ yuan_embedding_2_zh = ModelMeta(
15
+ name="IEITYuan/Yuan-embedding-2.0-zh",
16
+ model_type=["dense"],
17
+ loader=sentence_transformers_loader,
18
+ languages=["zho-Hans"],
19
+ open_weights=True,
20
+ revision="b5ebcace6f4fc6e5a4d1852557eb2dc2d1040cee",
21
+ release_date="2025-11-24",
22
+ n_parameters=326000000,
23
+ memory_usage_mb=1242,
24
+ embed_dim=1792,
25
+ license="apache-2.0",
26
+ max_tokens=512,
27
+ reference="https://huggingface.co/IEITYuan/Yuan-embedding-2.0-zh",
28
+ similarity_fn_name="cosine",
29
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
30
+ use_instructions=False,
31
+ public_training_code=None,
32
+ public_training_data=None,
33
+ training_datasets=yuan_emb_zh_datasets,
34
+ )
@@ -0,0 +1,58 @@
1
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
+ from mteb.models.model_meta import ModelMeta
3
+ from mteb.models.models_protocols import PromptType
4
+
5
+
6
+ def instruction_template(
7
+ instruction: str, prompt_type: PromptType | None = None
8
+ ) -> str:
9
+ if not instruction or prompt_type == PromptType.document:
10
+ return ""
11
+ if isinstance(instruction, dict):
12
+ if prompt_type is None:
13
+ instruction = next(iter(instruction.values())) # TODO
14
+ else:
15
+ instruction = instruction[prompt_type]
16
+ return f"Instruct: {instruction}\nQuery:"
17
+
18
+
19
+ training_data = {
20
+ "T2Retrieval",
21
+ "DuRetrieval",
22
+ "MMarcoReranking",
23
+ "CMedQAv2-reranking",
24
+ "NQ",
25
+ "MSMARCO",
26
+ "HotpotQA",
27
+ "MrTidyRetrieval",
28
+ "MIRACLRetrieval",
29
+ "CodeSearchNet",
30
+ }
31
+
32
+
33
+ yuan_embedding_2_en = ModelMeta(
34
+ loader=InstructSentenceTransformerModel,
35
+ loader_kwargs=dict(
36
+ instruction_template=instruction_template,
37
+ apply_instruction_to_passages=False,
38
+ ),
39
+ name="IEITYuan/Yuan-embedding-2.0-en",
40
+ model_type=["dense"],
41
+ languages=["eng-Latn"],
42
+ open_weights=True,
43
+ revision="b2fd15da3bcae3473c8529593825c15068f09fce",
44
+ release_date="2025-11-27",
45
+ n_parameters=595776512,
46
+ memory_usage_mb=2272,
47
+ embed_dim=1024,
48
+ max_tokens=2048,
49
+ license="apache-2.0",
50
+ reference="https://huggingface.co/IEITYuan/Yuan-embedding-2.0-en",
51
+ similarity_fn_name="cosine",
52
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
53
+ use_instructions=True,
54
+ public_training_code=None,
55
+ public_training_data=None,
56
+ training_datasets=training_data,
57
+ adapted_from="Qwen/Qwen3-Embedding-0.6B",
58
+ )