mteb 2.1.4__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (458) hide show
  1. mteb/__init__.py +4 -0
  2. mteb/_create_dataloaders.py +6 -3
  3. mteb/_evaluators/any_sts_evaluator.py +21 -12
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +1 -1
  6. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +9 -4
  7. mteb/_evaluators/pair_classification_evaluator.py +30 -38
  8. mteb/_evaluators/sklearn_evaluator.py +15 -28
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +4 -1
  10. mteb/_evaluators/text/summarization_evaluator.py +4 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +2 -2
  12. mteb/abstasks/_data_filter/__init__.py +0 -0
  13. mteb/abstasks/_data_filter/filters.py +125 -0
  14. mteb/abstasks/_data_filter/task_pipelines.py +102 -0
  15. mteb/abstasks/_statistics_calculation.py +6 -2
  16. mteb/abstasks/classification.py +0 -2
  17. mteb/abstasks/clustering.py +1 -1
  18. mteb/abstasks/clustering_legacy.py +3 -0
  19. mteb/abstasks/multilabel_classification.py +10 -3
  20. mteb/abstasks/pair_classification.py +8 -1
  21. mteb/abstasks/sts.py +7 -0
  22. mteb/abstasks/task_metadata.py +1 -0
  23. mteb/benchmarks/_create_table.py +84 -37
  24. mteb/benchmarks/benchmark.py +74 -15
  25. mteb/benchmarks/benchmarks/__init__.py +8 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +259 -15
  27. mteb/benchmarks/get_benchmark.py +2 -0
  28. mteb/cache.py +47 -10
  29. mteb/deprecated_evaluator.py +8 -13
  30. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  31. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  32. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  39. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  40. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  41. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  42. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  43. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  44. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  45. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  46. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  47. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  48. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  49. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  50. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  51. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  52. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  53. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  54. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  56. mteb/evaluate.py +65 -45
  57. mteb/leaderboard/app.py +268 -133
  58. mteb/leaderboard/benchmark_selector.py +14 -5
  59. mteb/leaderboard/figures.py +13 -15
  60. mteb/leaderboard/table.py +82 -17
  61. mteb/models/__init__.py +4 -1
  62. mteb/models/abs_encoder.py +21 -17
  63. mteb/models/cache_wrappers/__init__.py +2 -1
  64. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
  65. mteb/models/cache_wrappers/cache_wrapper.py +1 -1
  66. mteb/models/get_model_meta.py +3 -114
  67. mteb/models/instruct_wrapper.py +5 -1
  68. mteb/models/model_implementations/align_models.py +7 -0
  69. mteb/models/model_implementations/amazon_models.py +1 -0
  70. mteb/models/model_implementations/andersborges.py +65 -0
  71. mteb/models/model_implementations/ara_models.py +8 -0
  72. mteb/models/model_implementations/arctic_models.py +8 -0
  73. mteb/models/model_implementations/b1ade_models.py +1 -0
  74. mteb/models/model_implementations/bedrock_models.py +4 -0
  75. mteb/models/model_implementations/bge_models.py +60 -0
  76. mteb/models/model_implementations/bica_model.py +35 -0
  77. mteb/models/model_implementations/blip2_models.py +11 -0
  78. mteb/models/model_implementations/blip_models.py +27 -0
  79. mteb/models/model_implementations/bm25.py +1 -0
  80. mteb/models/model_implementations/bmretriever_models.py +4 -0
  81. mteb/models/model_implementations/cadet_models.py +9 -0
  82. mteb/models/model_implementations/cde_models.py +14 -0
  83. mteb/models/model_implementations/clip_models.py +3 -0
  84. mteb/models/model_implementations/clips_models.py +100 -0
  85. mteb/models/model_implementations/codefuse_models.py +162 -0
  86. mteb/models/model_implementations/codesage_models.py +15 -0
  87. mteb/models/model_implementations/cohere_models.py +8 -1
  88. mteb/models/model_implementations/cohere_v.py +5 -0
  89. mteb/models/model_implementations/colpali_models.py +14 -6
  90. mteb/models/model_implementations/colqwen_models.py +271 -1
  91. mteb/models/model_implementations/colsmol_models.py +2 -0
  92. mteb/models/model_implementations/conan_models.py +1 -0
  93. mteb/models/model_implementations/dino_models.py +171 -0
  94. mteb/models/model_implementations/e5_instruct.py +4 -0
  95. mteb/models/model_implementations/e5_models.py +12 -101
  96. mteb/models/model_implementations/e5_v.py +1 -0
  97. mteb/models/model_implementations/eagerworks_models.py +164 -0
  98. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  99. mteb/models/model_implementations/en_code_retriever.py +1 -0
  100. mteb/models/model_implementations/euler_models.py +32 -0
  101. mteb/models/model_implementations/evaclip_models.py +4 -0
  102. mteb/models/model_implementations/fa_models.py +58 -0
  103. mteb/models/model_implementations/facebookai.py +193 -0
  104. mteb/models/model_implementations/geogpt_models.py +1 -0
  105. mteb/models/model_implementations/gme_v_models.py +11 -5
  106. mteb/models/model_implementations/google_models.py +16 -5
  107. mteb/models/model_implementations/granite_vision_embedding_models.py +7 -2
  108. mteb/models/model_implementations/gritlm_models.py +2 -0
  109. mteb/models/model_implementations/gte_models.py +78 -0
  110. mteb/models/model_implementations/hinvec_models.py +1 -0
  111. mteb/models/model_implementations/human.py +1 -0
  112. mteb/models/model_implementations/ibm_granite_models.py +6 -0
  113. mteb/models/model_implementations/inf_models.py +2 -0
  114. mteb/models/model_implementations/jasper_models.py +255 -2
  115. mteb/models/model_implementations/jina_clip.py +1 -0
  116. mteb/models/model_implementations/jina_models.py +209 -5
  117. mteb/models/model_implementations/kalm_models.py +203 -25
  118. mteb/models/model_implementations/kblab.py +31 -0
  119. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  120. mteb/models/model_implementations/kfst.py +25 -0
  121. mteb/models/model_implementations/kowshik24_models.py +32 -0
  122. mteb/models/model_implementations/lens_models.py +2 -0
  123. mteb/models/model_implementations/lgai_embedding_models.py +1 -0
  124. mteb/models/model_implementations/linq_models.py +3 -2
  125. mteb/models/model_implementations/listconranker.py +1 -1
  126. mteb/models/model_implementations/llm2clip_models.py +3 -0
  127. mteb/models/model_implementations/llm2vec_models.py +8 -0
  128. mteb/models/model_implementations/mcinext_models.py +3 -0
  129. mteb/models/model_implementations/mdbr_models.py +2 -0
  130. mteb/models/model_implementations/misc_models.py +362 -0
  131. mteb/models/model_implementations/mme5_models.py +1 -0
  132. mteb/models/model_implementations/moco_models.py +11 -0
  133. mteb/models/model_implementations/mod_models.py +191 -0
  134. mteb/models/model_implementations/model2vec_models.py +13 -0
  135. mteb/models/model_implementations/moka_models.py +3 -0
  136. mteb/models/model_implementations/mxbai_models.py +9 -0
  137. mteb/models/model_implementations/nbailab.py +70 -0
  138. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -0
  139. mteb/models/model_implementations/nomic_models.py +156 -4
  140. mteb/models/model_implementations/nomic_models_vision.py +7 -2
  141. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +23 -16
  142. mteb/models/model_implementations/nvidia_models.py +4 -1
  143. mteb/models/model_implementations/octen_models.py +195 -0
  144. mteb/models/model_implementations/openai_models.py +20 -16
  145. mteb/models/model_implementations/openclip_models.py +24 -0
  146. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -0
  147. mteb/models/model_implementations/ops_moa_models.py +4 -2
  148. mteb/models/model_implementations/pawan_models.py +39 -0
  149. mteb/models/model_implementations/piccolo_models.py +8 -0
  150. mteb/models/model_implementations/promptriever_models.py +8 -4
  151. mteb/models/model_implementations/pylate_models.py +37 -4
  152. mteb/models/model_implementations/qodo_models.py +2 -0
  153. mteb/models/model_implementations/qtack_models.py +1 -0
  154. mteb/models/model_implementations/qwen3_models.py +6 -3
  155. mteb/models/model_implementations/qzhou_models.py +3 -1
  156. mteb/models/model_implementations/random_baseline.py +16 -21
  157. mteb/models/model_implementations/rasgaard_models.py +34 -0
  158. mteb/models/model_implementations/reasonir_model.py +1 -0
  159. mteb/models/model_implementations/repllama_models.py +2 -0
  160. mteb/models/model_implementations/rerankers_custom.py +3 -3
  161. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  162. mteb/models/model_implementations/richinfoai_models.py +1 -0
  163. mteb/models/model_implementations/ru_sentence_models.py +51 -0
  164. mteb/models/model_implementations/ruri_models.py +322 -0
  165. mteb/models/model_implementations/salesforce_models.py +3 -0
  166. mteb/models/model_implementations/samilpwc_models.py +1 -0
  167. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  168. mteb/models/model_implementations/searchmap_models.py +1 -0
  169. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  170. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +658 -0
  171. mteb/models/model_implementations/seed_models.py +1 -0
  172. mteb/models/model_implementations/sentence_transformers_models.py +57 -0
  173. mteb/models/model_implementations/shuu_model.py +32 -31
  174. mteb/models/model_implementations/siglip_models.py +10 -0
  175. mteb/models/model_implementations/sonar_models.py +1 -0
  176. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  177. mteb/models/model_implementations/stella_models.py +6 -0
  178. mteb/models/model_implementations/tarka_models.py +376 -0
  179. mteb/models/model_implementations/ua_sentence_models.py +10 -0
  180. mteb/models/model_implementations/uae_models.py +1 -0
  181. mteb/models/model_implementations/vdr_models.py +2 -0
  182. mteb/models/model_implementations/vi_vn_models.py +39 -0
  183. mteb/models/model_implementations/vista_models.py +2 -0
  184. mteb/models/model_implementations/vlm2vec_models.py +2 -0
  185. mteb/models/model_implementations/voyage_models.py +15 -0
  186. mteb/models/model_implementations/voyage_v.py +8 -2
  187. mteb/models/model_implementations/xyz_models.py +1 -0
  188. mteb/models/model_implementations/youtu_models.py +1 -0
  189. mteb/models/model_implementations/yuan_models.py +34 -0
  190. mteb/models/model_implementations/yuan_models_en.py +58 -0
  191. mteb/models/model_meta.py +442 -22
  192. mteb/models/search_encoder_index/__init__.py +7 -0
  193. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  194. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  195. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
  196. mteb/models/search_wrappers.py +165 -48
  197. mteb/models/sentence_transformer_wrapper.py +2 -7
  198. mteb/results/benchmark_results.py +88 -47
  199. mteb/results/model_result.py +11 -4
  200. mteb/results/task_result.py +37 -19
  201. mteb/similarity_functions.py +49 -0
  202. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  203. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  204. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  205. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  206. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  207. mteb/tasks/classification/ara/ajgt.py +1 -2
  208. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  209. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  210. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  211. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  212. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  213. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  214. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  215. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  216. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  217. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  218. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  219. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  220. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  221. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  222. mteb/tasks/classification/dan/dk_hate_classification.py +1 -2
  223. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  224. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  225. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  226. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  227. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  228. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  229. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  230. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  231. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  232. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  233. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  234. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  235. mteb/tasks/classification/eng/news_classification.py +1 -2
  236. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  237. mteb/tasks/classification/eng/patent_classification.py +1 -2
  238. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  239. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  240. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  241. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  242. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  243. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  244. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  245. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  246. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  247. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  248. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  249. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  250. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  251. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  252. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  253. mteb/tasks/classification/est/estonian_valence.py +1 -2
  254. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  255. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  256. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  257. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  258. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  259. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  260. mteb/tasks/classification/heb/__init__.py +6 -1
  261. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  262. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  263. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  264. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  265. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  266. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  267. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  268. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  269. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  270. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  271. mteb/tasks/classification/kor/klue_tc.py +1 -2
  272. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  273. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  274. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +1 -2
  275. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  276. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  277. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  278. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  279. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  280. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  281. mteb/tasks/classification/multilingual/scala_classification.py +1 -2
  282. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  283. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  284. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  285. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  286. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  287. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  288. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  289. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  290. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  291. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  292. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  293. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  294. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  295. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  296. mteb/tasks/classification/pol/polish_classification.py +3 -6
  297. mteb/tasks/classification/ron/moroco.py +1 -2
  298. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  299. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  300. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  301. mteb/tasks/classification/rus/headline_classification.py +1 -2
  302. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  303. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  304. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  305. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  306. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  307. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  308. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  309. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  310. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  311. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  312. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  313. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  314. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  315. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  316. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  317. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  318. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  319. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  320. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  321. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  322. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  323. mteb/tasks/classification/tur/__init__.py +4 -0
  324. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  325. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  326. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  327. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  328. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  329. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  330. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  331. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  332. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  333. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  334. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  335. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  336. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  337. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  338. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  339. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  340. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  341. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  342. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  343. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  344. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  345. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  346. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  347. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  348. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  349. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  350. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  351. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  352. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  353. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  354. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  355. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  356. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  357. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  358. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  359. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  360. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  361. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  362. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  363. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  364. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  365. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  366. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  367. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  368. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  369. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  370. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  371. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  372. mteb/tasks/pair_classification/rus/terra.py +51 -25
  373. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  374. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  375. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  376. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  377. mteb/tasks/reranking/jpn/__init__.py +9 -1
  378. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  379. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  380. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  381. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  382. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  383. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  384. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  385. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  386. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  387. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  388. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  389. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  390. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  391. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  392. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  393. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  394. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  395. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  396. mteb/tasks/retrieval/kor/__init__.py +2 -1
  397. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  398. mteb/tasks/retrieval/multilingual/__init__.py +22 -0
  399. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  400. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  401. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  402. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  403. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  404. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  405. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  406. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  407. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +399 -0
  408. mteb/tasks/retrieval/nld/__init__.py +8 -4
  409. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  410. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  411. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  412. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  413. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  414. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  415. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  416. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  417. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  418. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  419. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  420. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +1 -5
  421. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  422. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  423. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  424. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  425. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  426. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  427. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  428. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  429. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  430. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  431. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +1 -5
  432. mteb/tasks/retrieval/vie/fevervn_retrieval.py +1 -7
  433. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  434. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  435. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +1 -6
  436. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +1 -5
  437. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  438. mteb/tasks/retrieval/vie/nqvn_retrieval.py +1 -5
  439. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  440. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  441. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  442. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  443. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  444. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  445. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  446. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  447. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  448. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  449. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  450. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  451. mteb/types/_encoder_io.py +7 -2
  452. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/METADATA +11 -5
  453. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/RECORD +457 -391
  454. mteb/models/model_implementations/nb_sbert.py +0 -25
  455. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/WHEEL +0 -0
  456. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/entry_points.txt +0 -0
  457. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/licenses/LICENSE +0 -0
  458. {mteb-2.1.4.dist-info → mteb-2.5.2.dist-info}/top_level.txt +0 -0
@@ -193,12 +193,13 @@ NOMIC_CITATION = """
193
193
  """
194
194
 
195
195
  nomic_embed_v1_5 = ModelMeta(
196
- loader=NomicWrapper,
196
+ loader=NomicWrapper, # type: ignore
197
197
  loader_kwargs=dict(
198
198
  trust_remote_code=True,
199
199
  model_prompts=model_prompts,
200
200
  ),
201
201
  name="nomic-ai/nomic-embed-text-v1.5",
202
+ model_type=["dense"],
202
203
  languages=["eng-Latn"],
203
204
  open_weights=True,
204
205
  revision="b0753ae76394dd36bcfb912a46018088bca48be0",
@@ -221,12 +222,13 @@ nomic_embed_v1_5 = ModelMeta(
221
222
  )
222
223
 
223
224
  nomic_embed_v1 = ModelMeta(
224
- loader=NomicWrapper,
225
+ loader=NomicWrapper, # type: ignore
225
226
  loader_kwargs=dict(
226
227
  trust_remote_code=True,
227
228
  model_prompts=model_prompts,
228
229
  ),
229
230
  name="nomic-ai/nomic-embed-text-v1",
231
+ model_type=["dense"],
230
232
  languages=["eng-Latn"],
231
233
  open_weights=True,
232
234
  revision="0759316f275aa0cb93a5b830973843ca66babcf5",
@@ -249,12 +251,13 @@ nomic_embed_v1 = ModelMeta(
249
251
  )
250
252
 
251
253
  nomic_embed_v1_ablated = ModelMeta(
252
- loader=NomicWrapper,
254
+ loader=NomicWrapper, # type: ignore
253
255
  loader_kwargs=dict(
254
256
  trust_remote_code=True,
255
257
  model_prompts=model_prompts,
256
258
  ),
257
259
  name="nomic-ai/nomic-embed-text-v1-ablated",
260
+ model_type=["dense"],
258
261
  languages=["eng-Latn"],
259
262
  open_weights=True,
260
263
  revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f",
@@ -276,12 +279,13 @@ nomic_embed_v1_ablated = ModelMeta(
276
279
  )
277
280
 
278
281
  nomic_embed_v1_unsupervised = ModelMeta(
279
- loader=NomicWrapper,
282
+ loader=NomicWrapper, # type: ignore
280
283
  loader_kwargs=dict(
281
284
  trust_remote_code=True,
282
285
  model_prompts=model_prompts,
283
286
  ),
284
287
  name="nomic-ai/nomic-embed-text-v1-unsupervised",
288
+ model_type=["dense"],
285
289
  languages=["eng-Latn"],
286
290
  open_weights=True,
287
291
  revision="b53d557b15ae63852847c222d336c1609eced93c",
@@ -309,6 +313,7 @@ nomic_modern_bert_embed = ModelMeta(
309
313
  model_prompts=model_prompts,
310
314
  ),
311
315
  name="nomic-ai/modernbert-embed-base",
316
+ model_type=["dense"],
312
317
  languages=["eng-Latn"],
313
318
  open_weights=True,
314
319
  revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12",
@@ -328,4 +333,151 @@ nomic_modern_bert_embed = ModelMeta(
328
333
  superseded_by=None,
329
334
  training_datasets=nomic_training_data,
330
335
  public_training_data=None,
336
+ citation="""@misc{nussbaum2024nomic,
337
+ title={Nomic Embed: Training a Reproducible Long Context Text Embedder},
338
+ author={Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
339
+ year={2024},
340
+ eprint={2402.01613},
341
+ archivePrefix={arXiv},
342
+ primaryClass={cs.CL}
343
+ }""",
344
+ )
345
+
346
+
347
+ m_languages = [
348
+ "eng-Latn",
349
+ "spa-Latn",
350
+ "fra-Latn",
351
+ "deu-Latn",
352
+ "ita-Latn",
353
+ "por-Latn",
354
+ "pol-Latn",
355
+ "nld-Latn",
356
+ "tur-Latn",
357
+ "jpn-Jpan",
358
+ "vie-Latn",
359
+ "rus-Cyrl",
360
+ "ind-Latn",
361
+ "arb-Arab",
362
+ "ces-Latn",
363
+ "ron-Latn",
364
+ "swe-Latn",
365
+ "ell-Grek",
366
+ "ukr-Cyrl",
367
+ "zho-Hans",
368
+ "hun-Latn",
369
+ "dan-Latn",
370
+ "nor-Latn",
371
+ "hin-Deva",
372
+ "fin-Latn",
373
+ "bul-Cyrl",
374
+ "kor-Hang",
375
+ "slk-Latn",
376
+ "tha-Thai",
377
+ "heb-Hebr",
378
+ "cat-Latn",
379
+ "lit-Latn",
380
+ "fas-Arab",
381
+ "msa-Latn",
382
+ "slv-Latn",
383
+ "lav-Latn",
384
+ "mar-Deva",
385
+ "ben-Beng",
386
+ "sqi-Latn",
387
+ "cym-Latn",
388
+ "bel-Cyrl",
389
+ "mal-Mlym",
390
+ "kan-Knda",
391
+ "mkd-Cyrl",
392
+ "urd-Arab",
393
+ "fry-Latn",
394
+ "fil-Latn",
395
+ "tel-Telu",
396
+ "eus-Latn",
397
+ "swh-Latn",
398
+ "som-Latn",
399
+ "snd-Arab",
400
+ "uzb-Latn",
401
+ "cos-Latn",
402
+ "hrv-Latn",
403
+ "guj-Gujr",
404
+ "hin-Latn",
405
+ "ceb-Latn",
406
+ "epo-Latn",
407
+ "jav-Latn",
408
+ "lat-Latn",
409
+ "zul-Latn",
410
+ "mon-Cyrl",
411
+ "sin-Sinh",
412
+ "ell-Latn",
413
+ "gle-Latn",
414
+ "kir-Cyrl",
415
+ "tgk-Cyrl",
416
+ "mya-Mymr",
417
+ "khm-Khmr",
418
+ "mlg-Latn",
419
+ "pan-Guru",
420
+ "rus-Latn",
421
+ "sna-Latn",
422
+ "zho-Latn",
423
+ "hau-Latn",
424
+ "heb-Latn",
425
+ "hmn-Latn",
426
+ "hat-Latn",
427
+ "jpn-Latn",
428
+ "sun-Latn",
429
+ "bul-Latn",
430
+ "gla-Latn",
431
+ "nya-Latn",
432
+ "pus-Arab",
433
+ "kur-Latn",
434
+ "hbs-Latn",
435
+ "amh-Ethi",
436
+ "ibo-Latn",
437
+ "lao-Laoo",
438
+ "mri-Latn",
439
+ "nno-Latn",
440
+ "smo-Latn",
441
+ "yid-Hebr",
442
+ "sot-Latn",
443
+ "tgl-Latn",
444
+ "xho-Latn",
445
+ "yor-Latn",
446
+ ]
447
+
448
+ nomic_embed_text_v2_moe = ModelMeta(
449
+ loader=NomicWrapper, # type: ignore
450
+ loader_kwargs=dict(
451
+ trust_remote_code=True,
452
+ model_prompts=model_prompts,
453
+ ),
454
+ name="nomic-ai/nomic-embed-text-v2-moe",
455
+ model_type=["dense"],
456
+ languages=m_languages,
457
+ open_weights=True,
458
+ revision="1066b6599d099fbb93dfcb64f9c37a7c9e503e85",
459
+ release_date="2025-02-07",
460
+ n_parameters=475292928,
461
+ memory_usage_mb=1813,
462
+ max_tokens=512,
463
+ embed_dim=768,
464
+ license="apache-2.0",
465
+ reference="https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe",
466
+ similarity_fn_name=ScoringFunction.COSINE,
467
+ framework=["Sentence Transformers", "PyTorch"],
468
+ use_instructions=True,
469
+ adapted_from="nomic-ai/nomic-xlm-2048",
470
+ public_training_data="https://github.com/nomic-ai/contrastors?tab=readme-ov-file#data-access",
471
+ public_training_code="https://github.com/nomic-ai/contrastors/blob/613ddfd37309e538cceadb05b1e6423e7b09f603/src/contrastors/configs/train/contrastive_finetune_moe.yaml",
472
+ training_datasets=None, # did not look into this further
473
+ superseded_by=None,
474
+ citation="""@misc{nussbaum2025trainingsparsemixtureexperts,
475
+ title={Training Sparse Mixture Of Experts Text Embedding Models},
476
+ author={Zach Nussbaum and Brandon Duderstadt},
477
+ year={2025},
478
+ eprint={2502.07972},
479
+ archivePrefix={arXiv},
480
+ primaryClass={cs.CL},
481
+ url={https://arxiv.org/abs/2502.07972},
482
+ }""",
331
483
  )
@@ -1,8 +1,9 @@
1
- from typing import Any
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import torch
4
6
  import torch.nn.functional as F
5
- from PIL import Image
6
7
  from torch.utils.data import DataLoader
7
8
  from tqdm.auto import tqdm
8
9
 
@@ -12,6 +13,9 @@ from mteb.models.abs_encoder import AbsEncoder
12
13
  from mteb.models.model_meta import ModelMeta, ScoringFunction
13
14
  from mteb.types import Array, BatchedInput, PromptType
14
15
 
16
+ if TYPE_CHECKING:
17
+ from PIL import Image
18
+
15
19
  NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
16
20
  title={Nomic Embed Vision: Expanding the Latent Space},
17
21
  author={Nussbaum, Zach and Duderstadt, Brandon and Mulyar, Andriy},
@@ -164,6 +168,7 @@ nomic_embed_vision_v1_5 = ModelMeta(
164
168
  "text_model_revision": "a03db6748c80237063eb0546ac6b627eca2318cb",
165
169
  },
166
170
  name="nomic-ai/nomic-embed-vision-v1.5",
171
+ model_type=["dense"],
167
172
  languages=["eng-Latn"],
168
173
  revision="af2246fffdab78d8458418480e4886a8e48b70a7",
169
174
  release_date="2024-06-08",
@@ -1,7 +1,6 @@
1
- from typing import Any
1
+ from typing import TYPE_CHECKING, Any
2
2
 
3
3
  import torch
4
- from PIL import Image
5
4
  from torch.utils.data import DataLoader
6
5
 
7
6
  from mteb.abstasks.task_metadata import TaskMetadata
@@ -9,6 +8,10 @@ from mteb.models.abs_encoder import AbsEncoder
9
8
  from mteb.models.model_meta import ModelMeta
10
9
  from mteb.types import Array, BatchedInput, PromptType
11
10
 
11
+ if TYPE_CHECKING:
12
+ pass
13
+
14
+
12
15
  LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
13
16
  title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
14
17
  author={Mengyao Xu and Gabriel Moreira and Ronay Ak and Radek Osmulski and Yauhen Babakhin and Zhiding Yu and Benedikt Schifferer and Even Oldridge},
@@ -53,6 +56,7 @@ class LlamaNemoretrieverColembed(AbsEncoder):
53
56
  **kwargs,
54
57
  ):
55
58
  import torchvision.transforms.functional as F
59
+ from PIL import Image
56
60
 
57
61
  all_images = []
58
62
  if isinstance(images, DataLoader):
@@ -61,14 +65,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
61
65
  iterator = DataLoader(images, batch_size=batch_size)
62
66
 
63
67
  for batch in iterator:
64
- for b in batch:
68
+ for image in batch["image"]:
65
69
  pil_img = (
66
- F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b
70
+ image
71
+ if isinstance(image, Image.Image)
72
+ else F.to_pil_image(image.to("cpu"))
67
73
  )
68
74
  all_images.append(pil_img)
69
75
 
70
76
  batch_size = 1
71
- return self.model.forward_passages(all_images, batch_size=batch_size)
77
+ return self.model.forward_images(all_images, batch_size=batch_size)
72
78
 
73
79
  def calculate_probs(self, text_embeddings, image_embeddings):
74
80
  scores = self.similarity(text_embeddings, image_embeddings)
@@ -117,19 +123,18 @@ class LlamaNemoretrieverColembed(AbsEncoder):
117
123
 
118
124
  TRAINING_DATA = {
119
125
  # from https://huggingface.co/datasets/vidore/colpali_train_set
120
- "DocVQA",
121
- "InfoVQA",
122
- "TATDQA",
123
- "arXivQA",
124
- "hotpotqa",
125
- "miracl",
126
+ "VidoreDocVQARetrieval",
127
+ "VidoreInfoVQARetrieval",
128
+ "VidoreTatdqaRetrieval",
129
+ "VidoreArxivQARetrieval",
130
+ "HotpotQA",
131
+ "MIRACLRetrieval",
126
132
  "NQ",
127
- "stackexchange",
133
+ "StackExchangeClustering",
128
134
  "SQuAD",
129
135
  "WebInstructSub",
130
136
  "docmatix-ir",
131
- "vdr-multilingual-train",
132
- "colpali_train_set", # as it contains PDFs
137
+ "VDRMultilingualRetrieval",
133
138
  "VisRAG-Ret-Train-Synthetic-data",
134
139
  "VisRAG-Ret-Train-In-domain-data",
135
140
  "wiki-ss-nq",
@@ -141,12 +146,13 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
141
146
  trust_remote_code=True,
142
147
  ),
143
148
  name="nvidia/llama-nemoretriever-colembed-1b-v1",
149
+ model_type=["late-interaction"],
144
150
  languages=["eng-Latn"],
145
151
  revision="1f0fdea7f5b19532a750be109b19072d719b8177",
146
152
  release_date="2025-06-27",
147
153
  modalities=["image", "text"],
148
154
  n_parameters=2_418_000_000,
149
- memory_usage_mb=9224,
155
+ memory_usage_mb=4610,
150
156
  max_tokens=8192,
151
157
  embed_dim=2048,
152
158
  license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
@@ -167,12 +173,13 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
167
173
  trust_remote_code=True,
168
174
  ),
169
175
  name="nvidia/llama-nemoretriever-colembed-3b-v1",
176
+ model_type=["late-interaction"],
170
177
  languages=["eng-Latn"],
171
178
  revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c",
172
179
  release_date="2025-06-27",
173
180
  modalities=["image", "text"],
174
181
  n_parameters=4_407_000_000,
175
- memory_usage_mb=16811,
182
+ memory_usage_mb=8403,
176
183
  max_tokens=8192,
177
184
  embed_dim=3072,
178
185
  license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
@@ -111,6 +111,7 @@ NV_embed_v2 = ModelMeta(
111
111
  add_eos_token=True,
112
112
  ),
113
113
  name="nvidia/NV-Embed-v2",
114
+ model_type=["dense"],
114
115
  languages=["eng-Latn"],
115
116
  open_weights=True,
116
117
  revision="7604d305b621f14095a1aa23d351674c2859553a",
@@ -141,12 +142,13 @@ NV_embed_v1 = ModelMeta(
141
142
  add_eos_token=True,
142
143
  ),
143
144
  name="nvidia/NV-Embed-v1",
145
+ model_type=["dense"],
144
146
  languages=["eng-Latn"],
145
147
  open_weights=True,
146
148
  revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c",
147
149
  release_date="2024-09-13", # initial commit of hf model.
148
150
  n_parameters=7_850_000_000,
149
- memory_usage_mb=29945,
151
+ memory_usage_mb=14975,
150
152
  embed_dim=4096,
151
153
  license="cc-by-nc-4.0",
152
154
  max_tokens=32768,
@@ -528,6 +530,7 @@ class LlamaEmbedNemotron(AbsEncoder):
528
530
  llama_embed_nemotron_8b = ModelMeta(
529
531
  loader=LlamaEmbedNemotron,
530
532
  name="nvidia/llama-embed-nemotron-8b",
533
+ model_type=["dense"],
531
534
  languages=llama_embed_nemotron_evaluated_languages,
532
535
  open_weights=True,
533
536
  revision="84a375593d27d3528beb4e104822515659e093b4",
@@ -0,0 +1,195 @@
1
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
+ from mteb.models.model_meta import ModelMeta
3
+ from mteb.models.models_protocols import PromptType
4
+
5
+
6
+ def instruction_template(
7
+ instruction: str, prompt_type: PromptType | None = None
8
+ ) -> str:
9
+ if (
10
+ prompt_type == PromptType.document
11
+ ): # to avoid this issue: https://huggingface.co/Qwen/Qwen3-Embedding-8B/discussions/21
12
+ return " "
13
+ if not instruction:
14
+ return ""
15
+ if isinstance(instruction, dict):
16
+ if prompt_type is None:
17
+ instruction = next(iter(instruction.values())) # TODO
18
+ else:
19
+ instruction = instruction[prompt_type]
20
+ return f"Instruct: {instruction}\nQuery:"
21
+
22
+
23
+ multilingual_langs = [
24
+ "afr-Latn",
25
+ "ara-Arab",
26
+ "aze-Latn",
27
+ "bel-Cyrl",
28
+ "bul-Cyrl",
29
+ "ben-Beng",
30
+ "cat-Latn",
31
+ "ceb-Latn",
32
+ "ces-Latn",
33
+ "cym-Latn",
34
+ "dan-Latn",
35
+ "deu-Latn",
36
+ "ell-Grek",
37
+ "eng-Latn",
38
+ "spa-Latn",
39
+ "est-Latn",
40
+ "eus-Latn",
41
+ "fas-Arab",
42
+ "fin-Latn",
43
+ "fra-Latn",
44
+ "glg-Latn",
45
+ "guj-Gujr",
46
+ "heb-Hebr",
47
+ "hin-Deva",
48
+ "hrv-Latn",
49
+ "hat-Latn",
50
+ "hun-Latn",
51
+ "hye-Armn",
52
+ "ind-Latn",
53
+ "isl-Latn",
54
+ "ita-Latn",
55
+ "jpn-Jpan",
56
+ "jav-Latn",
57
+ "kat-Geor",
58
+ "kaz-Cyrl",
59
+ "khm-Khmr",
60
+ "kan-Knda",
61
+ "kor-Hang",
62
+ "kir-Cyrl",
63
+ "lao-Laoo",
64
+ "lit-Latn",
65
+ "lav-Latn",
66
+ "mkd-Cyrl",
67
+ "mal-Mlym",
68
+ "mon-Cyrl",
69
+ "mar-Deva",
70
+ "msa-Latn",
71
+ "mya-Mymr",
72
+ "nep-Deva",
73
+ "nld-Latn",
74
+ "nor-Latn",
75
+ "nob-Latn",
76
+ "nno-Latn",
77
+ "pan-Guru",
78
+ "pol-Latn",
79
+ "por-Latn",
80
+ "que-Latn",
81
+ "ron-Latn",
82
+ "rus-Cyrl",
83
+ "sin-Sinh",
84
+ "slk-Latn",
85
+ "slv-Latn",
86
+ "swa-Latn",
87
+ "tam-Taml",
88
+ "tel-Telu",
89
+ "tha-Thai",
90
+ "tgl-Latn",
91
+ "tur-Latn",
92
+ "ukr-Cyrl",
93
+ "urd-Arab",
94
+ "vie-Latn",
95
+ "yor-Latn",
96
+ "zho-Hans",
97
+ ]
98
+
99
+ OCTEN_CITATION = """@misc{octen-embedding-2025,
100
+ title={Octen-Embedding-8B: A Fine-tuned Multilingual Text Embedding Model},
101
+ author={Octen Team},
102
+ year={2025},
103
+ url={https://huggingface.co/bflhc/bflhc/Octen-Embedding-8B}
104
+ }"""
105
+
106
+ training_data = {
107
+ "T2Retrieval",
108
+ "DuRetrieval",
109
+ "MMarcoReranking",
110
+ "CMedQAv2-reranking",
111
+ "NQ",
112
+ "MSMARCO",
113
+ "HotpotQA",
114
+ "FEVER",
115
+ "MrTidyRetrieval",
116
+ "MIRACLRetrieval",
117
+ "CodeSearchNet",
118
+ }
119
+
120
+ # Predefined prompts for various RTEB tasks
121
+ _PREDEFINED_PROMPTS = {
122
+ # ========== Open Datasets ==========
123
+ # Legal domain
124
+ "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
125
+ "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
126
+ "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
127
+ "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
128
+ # Code domain
129
+ "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
130
+ "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
131
+ "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
132
+ "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
133
+ "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
134
+ # Finance domain
135
+ "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
136
+ "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
137
+ "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
138
+ # Medical domain
139
+ "CUREv1": "Given a medical query, retrieve relevant clinical documents",
140
+ "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
141
+ # SQL domain
142
+ "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
143
+ # Multilingual
144
+ "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
145
+ # ========== Private/Closed Datasets ==========
146
+ # Code domain (Private)
147
+ "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
148
+ "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
149
+ # Finance domain (Private)
150
+ "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
151
+ "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
152
+ "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
153
+ "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
154
+ # Healthcare domain (Private)
155
+ "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
156
+ "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
157
+ # Legal domain (Private)
158
+ "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
159
+ "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
160
+ "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
161
+ # General/Multilingual (Private)
162
+ "French1Retrieval": "Given a query, retrieve relevant passages",
163
+ "German1Retrieval": "Given a query, retrieve relevant passages",
164
+ }
165
+
166
+
167
+ Octen_Embedding_8B = ModelMeta(
168
+ loader=InstructSentenceTransformerModel,
169
+ loader_kwargs=dict(
170
+ instruction_template=instruction_template,
171
+ apply_instruction_to_passages=True,
172
+ prompts_dict=_PREDEFINED_PROMPTS,
173
+ max_seq_length=18480,
174
+ model_kwargs={"torch_dtype": "bfloat16"},
175
+ ),
176
+ name="bflhc/Octen-Embedding-8B",
177
+ languages=multilingual_langs,
178
+ open_weights=True,
179
+ revision="2030603c2926ab005fafd824fac5911e271be21f",
180
+ release_date="2025-12-23",
181
+ n_parameters=7567295488,
182
+ memory_usage_mb=14433,
183
+ embed_dim=4096,
184
+ max_tokens=32768,
185
+ license="apache-2.0",
186
+ reference="https://huggingface.co/bflhc/Octen-Embedding-8B",
187
+ similarity_fn_name="cosine",
188
+ framework=["Sentence Transformers", "PyTorch"],
189
+ use_instructions=True,
190
+ public_training_code=None,
191
+ public_training_data=None,
192
+ training_datasets=training_data,
193
+ citation=OCTEN_CITATION,
194
+ adapted_from="Qwen/Qwen3-Embedding-8B",
195
+ )
@@ -91,10 +91,6 @@ class OpenAIModel(AbsEncoder):
91
91
 
92
92
  from openai import NotGiven
93
93
 
94
- if self.model_name == "text-embedding-ada-002" and self._embed_dim is not None:
95
- logger.warning(
96
- "Reducing embedding size available only for text-embedding-3-* models"
97
- )
98
94
  sentences = [text for batch in inputs for text in batch["text"]]
99
95
 
100
96
  mask_sents = [(i, t) for i, t in enumerate(sentences) if t.strip()]
@@ -122,13 +118,22 @@ class OpenAIModel(AbsEncoder):
122
118
 
123
119
  no_empty_embeddings = []
124
120
 
121
+ # Set dimensions only for models that support it
122
+ dimensions = (
123
+ self._embed_dim or NotGiven()
124
+ if not self.model_name == "text-embedding-ada-002"
125
+ else NotGiven()
126
+ )
127
+ default_kwargs = dict(
128
+ model=self.model_name,
129
+ encoding_format="float",
130
+ dimensions=dimensions,
131
+ )
132
+
125
133
  for sublist in tqdm(sublists, leave=False, disable=not show_progress_bar):
126
134
  try:
127
135
  response = self._client.embeddings.create(
128
- input=sublist,
129
- model=self.model_name,
130
- encoding_format="float",
131
- dimensions=self._embed_dim or NotGiven(),
136
+ input=sublist, **default_kwargs
132
137
  )
133
138
  except Exception as e:
134
139
  # Sleep due to too many requests
@@ -138,19 +143,13 @@ class OpenAIModel(AbsEncoder):
138
143
  time.sleep(10)
139
144
  try:
140
145
  response = self._client.embeddings.create(
141
- input=sublist,
142
- model=self.model_name,
143
- encoding_format="float",
144
- dimensions=self._embed_dim or NotGiven(),
146
+ input=sublist, **default_kwargs
145
147
  )
146
148
  except Exception as e:
147
149
  logger.info("Sleeping for 60 seconds due to error", e)
148
150
  time.sleep(60)
149
151
  response = self._client.embeddings.create(
150
- input=sublist,
151
- model=self.model_name,
152
- encoding_format="float",
153
- dimensions=self._embed_dim or NotGiven(),
152
+ input=sublist, **default_kwargs
154
153
  )
155
154
  no_empty_embeddings.extend(self._to_numpy(response))
156
155
 
@@ -168,6 +167,7 @@ class OpenAIModel(AbsEncoder):
168
167
 
169
168
  text_embedding_3_small = ModelMeta(
170
169
  name="openai/text-embedding-3-small",
170
+ model_type=["dense"],
171
171
  revision="3",
172
172
  release_date="2024-01-25",
173
173
  languages=None, # supported languages not specified
@@ -192,6 +192,7 @@ text_embedding_3_small = ModelMeta(
192
192
  )
193
193
  text_embedding_3_large = ModelMeta(
194
194
  name="openai/text-embedding-3-large",
195
+ model_type=["dense"],
195
196
  revision="3",
196
197
  release_date="2024-01-25",
197
198
  languages=None, # supported languages not specified
@@ -216,6 +217,7 @@ text_embedding_3_large = ModelMeta(
216
217
  )
217
218
  text_embedding_ada_002 = ModelMeta(
218
219
  name="openai/text-embedding-ada-002",
220
+ model_type=["dense"],
219
221
  revision="3",
220
222
  release_date="2022-12-15",
221
223
  languages=None, # supported languages not specified
@@ -241,6 +243,7 @@ text_embedding_ada_002 = ModelMeta(
241
243
 
242
244
  text_embedding_3_small_512 = ModelMeta(
243
245
  name="openai/text-embedding-3-small (embed_dim=512)",
246
+ model_type=["dense"],
244
247
  revision="3",
245
248
  release_date="2024-01-25",
246
249
  languages=None, # supported languages not specified
@@ -267,6 +270,7 @@ text_embedding_3_small_512 = ModelMeta(
267
270
 
268
271
  text_embedding_3_large_512 = ModelMeta(
269
272
  name="openai/text-embedding-3-large (embed_dim=512)",
273
+ model_type=["dense"],
270
274
  revision="3",
271
275
  release_date="2024-01-25",
272
276
  languages=None, # supported languages not specified