mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458) hide show
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -117,6 +117,7 @@ CLIP_CITATION = """
117
117
  clip_vit_large_patch14 = ModelMeta(
118
118
  loader=CLIPModel, # type: ignore
119
119
  name="openai/clip-vit-large-patch14",
120
+ model_type=["dense"],
120
121
  languages=["eng-Latn"],
121
122
  revision="32bd64288804d66eefd0ccbe215aa642df71cc41",
122
123
  release_date="2021-02-26",
@@ -140,6 +141,7 @@ clip_vit_large_patch14 = ModelMeta(
140
141
  clip_vit_base_patch32 = ModelMeta(
141
142
  loader=CLIPModel, # type: ignore
142
143
  name="openai/clip-vit-base-patch32",
144
+ model_type=["dense"],
143
145
  languages=["eng-Latn"],
144
146
  revision="3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268",
145
147
  release_date="2021-02-26",
@@ -163,6 +165,7 @@ clip_vit_base_patch32 = ModelMeta(
163
165
  clip_vit_base_patch16 = ModelMeta(
164
166
  loader=CLIPModel, # type: ignore
165
167
  name="openai/clip-vit-base-patch16",
168
+ model_type=["dense"],
166
169
  languages=["eng-Latn"],
167
170
  revision="57c216476eefef5ab752ec549e440a49ae4ae5f3",
168
171
  release_date="2021-02-26",
@@ -0,0 +1,100 @@
1
+ from mteb.models.model_meta import (
2
+ ModelMeta,
3
+ ScoringFunction,
4
+ )
5
+ from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
6
+
7
+ from .e5_models import ME5_TRAINING_DATA, model_prompts
8
+
9
+ E5_NL_CITATION = """
10
+ @misc{banar2025mtebnle5nlembeddingbenchmark,
11
+ archiveprefix = {arXiv},
12
+ author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
13
+ eprint = {2509.12340},
14
+ primaryclass = {cs.CL},
15
+ title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
16
+ url = {https://arxiv.org/abs/2509.12340},
17
+ year = {2025},
18
+ }
19
+ """
20
+
21
+ e5_nl_small = ModelMeta(
22
+ loader=sentence_transformers_loader,
23
+ loader_kwargs=dict(
24
+ model_prompts=model_prompts,
25
+ ),
26
+ name="clips/e5-small-trm-nl",
27
+ model_type=["dense"],
28
+ languages=["nld-Latn"],
29
+ open_weights=True,
30
+ revision="0243664a6c5e12eef854b091eb283e51833c3e9f",
31
+ release_date="2025-09-23",
32
+ n_parameters=40_800_000,
33
+ memory_usage_mb=78,
34
+ embed_dim=384,
35
+ license="mit",
36
+ max_tokens=512,
37
+ reference="https://huggingface.co/clips/e5-small-trm-nl",
38
+ similarity_fn_name=ScoringFunction.COSINE,
39
+ framework=["Sentence Transformers", "PyTorch"],
40
+ use_instructions=True,
41
+ public_training_code="https://github.com/ELotfi/e5-nl",
42
+ public_training_data="https://huggingface.co/collections/clips/beir-nl",
43
+ training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
44
+ adapted_from="intfloat/multilingual-e5-small",
45
+ citation=E5_NL_CITATION,
46
+ )
47
+
48
+ e5_nl_base = ModelMeta(
49
+ loader=sentence_transformers_loader,
50
+ loader_kwargs=dict(
51
+ model_prompts=model_prompts,
52
+ ),
53
+ name="clips/e5-base-trm-nl",
54
+ model_type=["dense"],
55
+ languages=["nld-Latn"],
56
+ open_weights=True,
57
+ revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956",
58
+ release_date="2025-09-23",
59
+ n_parameters=124_400_000,
60
+ memory_usage_mb=237,
61
+ embed_dim=768,
62
+ license="mit",
63
+ max_tokens=514,
64
+ reference="https://huggingface.co/clips/e5-base-trm-nl",
65
+ similarity_fn_name=ScoringFunction.COSINE,
66
+ framework=["Sentence Transformers", "PyTorch"],
67
+ use_instructions=True,
68
+ public_training_code="https://github.com/ELotfi/e5-nl",
69
+ public_training_data="https://huggingface.co/collections/clips/beir-nl",
70
+ adapted_from="intfloat/multilingual-e5-base",
71
+ training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
72
+ citation=E5_NL_CITATION,
73
+ )
74
+
75
+ e5_nl_large = ModelMeta(
76
+ loader=sentence_transformers_loader,
77
+ loader_kwargs=dict(
78
+ model_prompts=model_prompts,
79
+ ),
80
+ name="clips/e5-large-trm-nl",
81
+ model_type=["dense"],
82
+ languages=["nld-Latn"],
83
+ open_weights=True,
84
+ revision="683333f86ed9eb3699b5567f0fdabeb958d412b0",
85
+ release_date="2025-09-23",
86
+ n_parameters=355_000_000,
87
+ memory_usage_mb=1355,
88
+ embed_dim=1024,
89
+ license="mit",
90
+ max_tokens=514,
91
+ reference="https://huggingface.co/clips/e5-large-trm-nl",
92
+ similarity_fn_name=ScoringFunction.COSINE,
93
+ framework=["Sentence Transformers", "PyTorch"],
94
+ use_instructions=True,
95
+ public_training_code="https://github.com/ELotfi/e5-nl",
96
+ public_training_data="https://huggingface.co/collections/clips/beir-nl",
97
+ training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
98
+ adapted_from="intfloat/multilingual-e5-large",
99
+ citation=E5_NL_CITATION,
100
+ )
@@ -1,7 +1,20 @@
1
1
  from mteb.models import ModelMeta
2
2
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
3
+ from mteb.models.model_meta import ScoringFunction
3
4
  from mteb.types import PromptType
4
5
 
6
+ F2LLM_CITATION = """@article{2025F2LLM,
7
+ title={F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data},
8
+ author={Ziyin Zhang and Zihan Liao and Hang Yu and Peng Di and Rui Wang},
9
+ journal={CoRR},
10
+ volume={abs/2510.02294},
11
+ year={2025},
12
+ url={https://doi.org/10.48550/arXiv.2510.02294},
13
+ doi={10.48550/ARXIV.2510.02294},
14
+ eprinttype={arXiv},
15
+ eprint={2510.02294}
16
+ }"""
17
+
5
18
  training_datasets = {
6
19
  "MSMARCO",
7
20
  "ArguAna",
@@ -62,6 +75,22 @@ training_datasets = {
62
75
  "TwentyNewsgroupsClustering",
63
76
  }
64
77
 
78
+ c2llm_training_datasets = {
79
+ "CodeSearchNet",
80
+ "CodeSearchNetRetrieval",
81
+ "CodeSearchNetCCRetrieval",
82
+ "CodeEditSearchRetrieval",
83
+ "CodeFeedbackMT",
84
+ "CodeFeedbackST",
85
+ "CodeTransOceanContest",
86
+ "CodeTransOceanDL",
87
+ "COIRCodeSearchNetRetrieval",
88
+ "CosQA",
89
+ "StackOverflowQA",
90
+ "SyntheticText2SQL",
91
+ "AdvTrain",
92
+ }
93
+
65
94
  prompts_dict = {
66
95
  "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not counterfactual.",
67
96
  "Banking77Classification": "Given an online banking query, find the corresponding intents.",
@@ -107,6 +136,77 @@ prompts_dict = {
107
136
  }
108
137
 
109
138
 
139
+ c2llm_prompts_dict = {
140
+ "CodeEditSearchRetrieval": {
141
+ "query": "Retrieve the diff code that relevant the following query:\n",
142
+ "document": "Retrieved Answer:",
143
+ },
144
+ "CodeSearchNetRetrieval": {
145
+ "query": "Retrieve the code that solves the following query:\n",
146
+ "document": "Retrieved Answer:",
147
+ },
148
+ "AppsRetrieval": {
149
+ "query": "Given a problem description from a programming contest, retrieve code examples that can assist in solving it.\n",
150
+ "document": "Retrieved Answer:",
151
+ },
152
+ "CodeFeedbackMT": {
153
+ "query": "Given a multi-turn conversation history that includes both text and code, retrieve relevant multi-modal answers composed of text and code that address the ongoing discussion.\n",
154
+ "document": "Retrieved Answer:",
155
+ },
156
+ "CodeFeedbackST": {
157
+ "query": "Given a single-turn question composed of text and code, retrieve suitable answers that also mix text and code to provide helpful feedback.\n",
158
+ "document": "Retrieved Answer:",
159
+ },
160
+ "CodeSearchNetCCRetrieval": {
161
+ "query": "Given an initial code segment, retrieve the subsequent segment that continues the code.\n",
162
+ "document": "Retrieved Answer:",
163
+ },
164
+ "CodeTransOceanContest": {
165
+ "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
166
+ "document": "Retrieved Answer:",
167
+ },
168
+ "CodeTransOceanDL": {
169
+ "query": "Given a Python code snippet, retrieve its semantically equivalent version written in C++.\n",
170
+ "document": "Retrieved Answer:",
171
+ },
172
+ "COIRCodeSearchNetRetrieval": {
173
+ "query": "Given a code snippet, retrieve its corresponding document string that summarizes its functionality.\n",
174
+ "document": "Retrieved Answer:",
175
+ },
176
+ "CosQA": {
177
+ "query": "Given a query from a web search, retrieve code that is helpful in addressing the query.\n",
178
+ "document": "Retrieved Answer:",
179
+ },
180
+ "StackOverflowQA": {
181
+ "query": "Given a question combining text and code, retrieve relevant answers that also contain both text and code snippets and can address the question.\n",
182
+ "document": "Retrieved Answer:",
183
+ },
184
+ "SyntheticText2SQL": {
185
+ "query": "Given a natural language question, retrieve SQL queries that serve as appropriate responses.\n",
186
+ "document": "Retrieved Answer:",
187
+ },
188
+ }
189
+
190
+ c2llm_languages = [
191
+ "eng-Latn",
192
+ "zho-Hans",
193
+ "python-Code",
194
+ "javascript-Code",
195
+ "go-Code",
196
+ "ruby-Code",
197
+ "java-Code",
198
+ "php-Code",
199
+ ]
200
+
201
+ c2llm_loader_kwargs = dict(
202
+ trust_remote_code=True,
203
+ prompts_dict=c2llm_prompts_dict,
204
+ apply_instruction_to_passages=True,
205
+ max_seq_length=2048,
206
+ padding_side="left",
207
+ )
208
+
209
+
110
210
  def instruction_template(
111
211
  instruction: str, prompt_type: PromptType | None = None
112
212
  ) -> str:
@@ -130,6 +230,7 @@ F2LLM_0B6 = ModelMeta(
130
230
  max_seq_length=8192,
131
231
  ),
132
232
  name="codefuse-ai/F2LLM-0.6B",
233
+ model_type=["dense"],
133
234
  languages=["eng-Latn"],
134
235
  open_weights=True,
135
236
  revision="36416618b83d4bd84a8ca30c2ee01ed518f9f2e7",
@@ -146,6 +247,7 @@ F2LLM_0B6 = ModelMeta(
146
247
  public_training_code="https://github.com/codefuse-ai/F2LLM",
147
248
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
148
249
  training_datasets=training_datasets,
250
+ citation=F2LLM_CITATION,
149
251
  )
150
252
 
151
253
  F2LLM_1B7 = ModelMeta(
@@ -158,6 +260,7 @@ F2LLM_1B7 = ModelMeta(
158
260
  max_seq_length=8192,
159
261
  ),
160
262
  name="codefuse-ai/F2LLM-1.7B",
263
+ model_type=["dense"],
161
264
  languages=["eng-Latn"],
162
265
  open_weights=True,
163
266
  revision="fdce0e09655f42cea26f7f66f5a70cd4507ea45c",
@@ -174,6 +277,7 @@ F2LLM_1B7 = ModelMeta(
174
277
  public_training_code="https://github.com/codefuse-ai/F2LLM",
175
278
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
176
279
  training_datasets=training_datasets,
280
+ citation=F2LLM_CITATION,
177
281
  )
178
282
 
179
283
  F2LLM_4B = ModelMeta(
@@ -186,6 +290,7 @@ F2LLM_4B = ModelMeta(
186
290
  max_seq_length=8192,
187
291
  ),
188
292
  name="codefuse-ai/F2LLM-4B",
293
+ model_type=["dense"],
189
294
  languages=["eng-Latn"],
190
295
  open_weights=True,
191
296
  revision="9fe95901ed2b6b59dd7673d6e93c9d76766a1e25",
@@ -202,4 +307,61 @@ F2LLM_4B = ModelMeta(
202
307
  public_training_code="https://github.com/codefuse-ai/F2LLM",
203
308
  public_training_data="https://huggingface.co/datasets/codefuse-ai/F2LLM",
204
309
  training_datasets=training_datasets,
310
+ citation=F2LLM_CITATION,
311
+ )
312
+
313
+ C2LLM_0B5 = ModelMeta(
314
+ loader=InstructSentenceTransformerModel,
315
+ loader_kwargs=c2llm_loader_kwargs,
316
+ name="codefuse-ai/C2LLM-0.5B",
317
+ revision="f08c18be03de42c6e388948a1804d4b271a953a2",
318
+ release_date="2025-12-22",
319
+ languages=c2llm_languages,
320
+ n_parameters=497252096,
321
+ memory_usage_mb=948.0,
322
+ max_tokens=32768,
323
+ embed_dim=896,
324
+ license="apache-2.0",
325
+ open_weights=True,
326
+ public_training_code=None,
327
+ public_training_data=None,
328
+ framework=["PyTorch", "Sentence Transformers"],
329
+ reference="https://huggingface.co/codefuse-ai/C2LLM-0.5B",
330
+ similarity_fn_name=ScoringFunction.COSINE,
331
+ use_instructions=True,
332
+ training_datasets=c2llm_training_datasets,
333
+ adapted_from=None,
334
+ superseded_by=None,
335
+ modalities=["text"],
336
+ is_cross_encoder=None,
337
+ citation=None,
338
+ contacts=None,
339
+ )
340
+
341
+ C2LLM_7B = ModelMeta(
342
+ loader=InstructSentenceTransformerModel,
343
+ loader_kwargs=c2llm_loader_kwargs,
344
+ name="codefuse-ai/C2LLM-7B",
345
+ revision="c1dc16d6d64eb962c783bfb36a6d9c2f24a86dca",
346
+ release_date="2025-12-22",
347
+ languages=c2llm_languages,
348
+ n_parameters=7667028992,
349
+ memory_usage_mb=14624.0,
350
+ max_tokens=32768,
351
+ embed_dim=3584,
352
+ license="apache-2.0",
353
+ open_weights=True,
354
+ public_training_code=None,
355
+ public_training_data=None,
356
+ framework=["PyTorch", "Sentence Transformers"],
357
+ reference="https://huggingface.co/codefuse-ai/C2LLM-7B",
358
+ similarity_fn_name=ScoringFunction.COSINE,
359
+ use_instructions=True,
360
+ training_datasets=c2llm_training_datasets,
361
+ adapted_from=None,
362
+ superseded_by=None,
363
+ modalities=["text"],
364
+ is_cross_encoder=None,
365
+ citation=None,
366
+ contacts=None,
205
367
  )
@@ -1,6 +1,15 @@
1
1
  from mteb.models.model_meta import ModelMeta, ScoringFunction
2
2
  from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
3
3
 
4
+ CODESAGE_CITATION = """@inproceedings{
5
+ zhang2024code,
6
+ title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
7
+ author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
8
+ booktitle={The Twelfth International Conference on Learning Representations},
9
+ year={2024},
10
+ url={https://openreview.net/forum?id=vfzRRjumpX}
11
+ }"""
12
+
4
13
  codesage_languages = [
5
14
  "python-Code",
6
15
  "javascript-Code",
@@ -13,6 +22,7 @@ codesage_languages = [
13
22
  codesage_large = ModelMeta(
14
23
  loader=sentence_transformers_loader,
15
24
  name="codesage/codesage-large-v2",
25
+ model_type=["dense"],
16
26
  languages=codesage_languages,
17
27
  revision="6e5d6dc15db3e310c37c6dbac072409f95ffa5c5",
18
28
  release_date="2024-02-03",
@@ -33,11 +43,13 @@ codesage_large = ModelMeta(
33
43
  "CodeSearchNetRetrieval",
34
44
  "CodeSearchNetCCRetrieval",
35
45
  },
46
+ citation=CODESAGE_CITATION,
36
47
  )
37
48
 
38
49
  codesage_base = ModelMeta(
39
50
  loader=sentence_transformers_loader,
40
51
  name="codesage/codesage-base-v2",
52
+ model_type=["dense"],
41
53
  languages=codesage_languages,
42
54
  revision="92eac4f44c8674638f039f1b0d8280f2539cb4c7",
43
55
  release_date="2024-02-03",
@@ -58,11 +70,13 @@ codesage_base = ModelMeta(
58
70
  "CodeSearchNetRetrieval",
59
71
  "CodeSearchNetCCRetrieval",
60
72
  },
73
+ citation=CODESAGE_CITATION,
61
74
  )
62
75
 
63
76
  codesage_small = ModelMeta(
64
77
  loader=sentence_transformers_loader,
65
78
  name="codesage/codesage-small-v2",
79
+ model_type=["dense"],
66
80
  languages=codesage_languages,
67
81
  revision="4844c2f24b25e181aa43ca058cc73dd2622565c1",
68
82
  release_date="2024-02-03",
@@ -83,4 +97,5 @@ codesage_small = ModelMeta(
83
97
  "CodeSearchNetRetrieval",
84
98
  "CodeSearchNetCCRetrieval",
85
99
  },
100
+ citation=CODESAGE_CITATION,
86
101
  )
@@ -8,6 +8,7 @@ import torch
8
8
  from torch.utils.data import DataLoader
9
9
  from tqdm.auto import tqdm
10
10
 
11
+ from mteb._requires_package import requires_package
11
12
  from mteb.abstasks.task_metadata import TaskMetadata
12
13
  from mteb.models.abs_encoder import AbsEncoder
13
14
  from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -219,9 +220,11 @@ class CohereTextEmbeddingModel(AbsEncoder):
219
220
  output_dimension: int | None = None,
220
221
  **kwargs,
221
222
  ) -> None:
223
+ requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")
224
+
222
225
  import cohere # type: ignore
223
226
 
224
- self.model_name = model_name.lstrip("Cohere/Cohere-")
227
+ self.model_name = model_name.removeprefix("Cohere/Cohere-")
225
228
  self.sep = sep
226
229
  self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
227
230
  if embedding_type not in get_args(EmbeddingType):
@@ -377,6 +380,7 @@ cohere_mult_3 = ModelMeta(
377
380
  model_prompts=model_prompts,
378
381
  ),
379
382
  name="Cohere/Cohere-embed-multilingual-v3.0",
383
+ model_type=["dense"],
380
384
  languages=supported_languages,
381
385
  open_weights=False,
382
386
  revision="1",
@@ -401,6 +405,7 @@ cohere_eng_3 = ModelMeta(
401
405
  model_prompts=model_prompts,
402
406
  ),
403
407
  name="Cohere/Cohere-embed-english-v3.0",
408
+ model_type=["dense"],
404
409
  languages=["eng-Latn"],
405
410
  open_weights=False,
406
411
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -425,6 +430,7 @@ cohere_mult_light_3 = ModelMeta(
425
430
  model_prompts=model_prompts,
426
431
  ),
427
432
  name="Cohere/Cohere-embed-multilingual-light-v3.0",
433
+ model_type=["dense"],
428
434
  languages=supported_languages,
429
435
  open_weights=False,
430
436
  revision="1",
@@ -449,6 +455,7 @@ cohere_eng_light_3 = ModelMeta(
449
455
  model_prompts=model_prompts,
450
456
  ),
451
457
  name="Cohere/Cohere-embed-english-light-v3.0",
458
+ model_type=["dense"],
452
459
  languages=["eng-Latn"],
453
460
  open_weights=False,
454
461
  reference="https://cohere.com/blog/introducing-embed-v3",
@@ -381,6 +381,7 @@ cohere_mult_3 = ModelMeta(
381
381
  loader=cohere_v_loader, # type: ignore
382
382
  loader_kwargs={"model_name": "embed-multilingual-v3.0"},
383
383
  name="cohere/embed-multilingual-v3.0",
384
+ model_type=["dense"],
384
385
  languages=[], # Unknown, but support >100 languages
385
386
  revision="1",
386
387
  release_date="2024-10-24",
@@ -404,6 +405,7 @@ cohere_eng_3 = ModelMeta(
404
405
  loader=cohere_v_loader, # type: ignore
405
406
  loader_kwargs={"model_name": "embed-english-v3.0"},
406
407
  name="cohere/embed-english-v3.0",
408
+ model_type=["dense"],
407
409
  languages=["eng-Latn"],
408
410
  revision="1",
409
411
  release_date="2024-10-24",
@@ -426,6 +428,7 @@ cohere_eng_3 = ModelMeta(
426
428
  cohere_embed_v4_multimodal = ModelMeta(
427
429
  loader=cohere_v_loader,
428
430
  loader_kwargs=dict(model_name="embed-v4.0"),
431
+ model_type=["dense"],
429
432
  name="Cohere/Cohere-embed-v4.0",
430
433
  languages=all_languages,
431
434
  revision="1",
@@ -450,6 +453,7 @@ cohere_embed_v4_multimodal_binary = ModelMeta(
450
453
  loader=cohere_v_loader,
451
454
  loader_kwargs=dict(embedding_type="binary"),
452
455
  name="Cohere/Cohere-embed-v4.0 (output_dtype=binary)",
456
+ model_type=["dense"],
453
457
  languages=all_languages,
454
458
  revision="1",
455
459
  release_date="2024-12-01",
@@ -474,6 +478,7 @@ cohere_embed_v4_multimodal_int8 = ModelMeta(
474
478
  loader=cohere_v_loader,
475
479
  loader_kwargs=dict(embedding_type="int8"),
476
480
  name="Cohere/Cohere-embed-v4.0 (output_dtype=int8)",
481
+ model_type=["dense"],
477
482
  languages=all_languages,
478
483
  revision="1",
479
484
  release_date="2024-12-01",
@@ -1,8 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
- from typing import Any
4
+ from typing import TYPE_CHECKING, Any
3
5
 
4
6
  import torch
5
- from PIL import Image
6
7
  from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
@@ -15,6 +16,9 @@ from mteb.models.abs_encoder import AbsEncoder
15
16
  from mteb.models.model_meta import ModelMeta, ScoringFunction
16
17
  from mteb.types import Array, BatchedInput, PromptType
17
18
 
19
+ if TYPE_CHECKING:
20
+ from PIL import Image
21
+
18
22
  logger = logging.getLogger(__name__)
19
23
 
20
24
 
@@ -89,6 +93,7 @@ class ColPaliEngineWrapper(AbsEncoder):
89
93
  **kwargs,
90
94
  ):
91
95
  import torchvision.transforms.functional as F
96
+ from PIL import Image
92
97
 
93
98
  all_embeds = []
94
99
 
@@ -196,10 +201,10 @@ COLPALI_CITATION = """
196
201
 
197
202
  COLPALI_TRAINING_DATA = {
198
203
  # from https://huggingface.co/datasets/vidore/colpali_train_set
199
- "DocVQA",
200
- "InfoVQA",
201
- "TATDQA",
202
- "arXivQA",
204
+ "VidoreDocVQARetrieval",
205
+ "VidoreInfoVQARetrieval",
206
+ "VidoreTatdqaRetrieval",
207
+ "VidoreArxivQARetrieval",
203
208
  }
204
209
 
205
210
  colpali_v1_1 = ModelMeta(
@@ -208,6 +213,7 @@ colpali_v1_1 = ModelMeta(
208
213
  torch_dtype=torch.float16,
209
214
  ),
210
215
  name="vidore/colpali-v1.1",
216
+ model_type=["late-interaction"],
211
217
  languages=["eng-Latn"],
212
218
  revision="a0f15e3bcf97110e7ac1bb4be4bcd30eeb31992a",
213
219
  release_date="2024-08-21",
@@ -234,6 +240,7 @@ colpali_v1_2 = ModelMeta(
234
240
  torch_dtype=torch.float16,
235
241
  ),
236
242
  name="vidore/colpali-v1.2",
243
+ model_type=["late-interaction"],
237
244
  languages=["eng-Latn"],
238
245
  revision="6b89bc63c16809af4d111bfe412e2ac6bc3c9451",
239
246
  release_date="2024-08-26",
@@ -260,6 +267,7 @@ colpali_v1_3 = ModelMeta(
260
267
  torch_dtype=torch.float16,
261
268
  ),
262
269
  name="vidore/colpali-v1.3",
270
+ model_type=["late-interaction"],
263
271
  languages=["eng-Latn"],
264
272
  revision="1b5c8929330df1a66de441a9b5409a878f0de5b0",
265
273
  release_date="2024-11-01",