mteb 2.1.4__py3-none-any.whl → 2.7.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
Files changed (527) hide show
  1. mteb/__init__.py +6 -0
  2. mteb/_create_dataloaders.py +22 -20
  3. mteb/_evaluators/any_sts_evaluator.py +23 -14
  4. mteb/_evaluators/classification_metrics.py +54 -0
  5. mteb/_evaluators/clustering_evaluator.py +3 -3
  6. mteb/_evaluators/evaluator.py +4 -2
  7. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +18 -11
  8. mteb/_evaluators/pair_classification_evaluator.py +34 -40
  9. mteb/_evaluators/retrieval_evaluator.py +2 -2
  10. mteb/_evaluators/retrieval_metrics.py +18 -17
  11. mteb/_evaluators/sklearn_evaluator.py +25 -37
  12. mteb/_evaluators/text/bitext_mining_evaluator.py +31 -19
  13. mteb/_evaluators/text/summarization_evaluator.py +27 -20
  14. mteb/_evaluators/zeroshot_classification_evaluator.py +7 -5
  15. mteb/abstasks/_data_filter/__init__.py +0 -0
  16. mteb/abstasks/_data_filter/filters.py +125 -0
  17. mteb/abstasks/_data_filter/task_pipelines.py +105 -0
  18. mteb/abstasks/_statistics_calculation.py +23 -11
  19. mteb/abstasks/_stratification.py +18 -18
  20. mteb/abstasks/abstask.py +35 -28
  21. mteb/abstasks/aggregate_task_metadata.py +1 -9
  22. mteb/abstasks/aggregated_task.py +10 -29
  23. mteb/abstasks/classification.py +15 -12
  24. mteb/abstasks/clustering.py +20 -16
  25. mteb/abstasks/clustering_legacy.py +13 -10
  26. mteb/abstasks/image/image_text_pair_classification.py +7 -4
  27. mteb/abstasks/multilabel_classification.py +33 -22
  28. mteb/abstasks/pair_classification.py +27 -11
  29. mteb/abstasks/regression.py +4 -4
  30. mteb/abstasks/retrieval.py +28 -24
  31. mteb/abstasks/retrieval_dataset_loaders.py +2 -2
  32. mteb/abstasks/sts.py +14 -4
  33. mteb/abstasks/task_metadata.py +32 -33
  34. mteb/abstasks/text/bitext_mining.py +39 -28
  35. mteb/abstasks/text/reranking.py +8 -6
  36. mteb/abstasks/text/summarization.py +10 -5
  37. mteb/abstasks/zeroshot_classification.py +8 -4
  38. mteb/benchmarks/_create_table.py +84 -37
  39. mteb/benchmarks/benchmark.py +77 -16
  40. mteb/benchmarks/benchmarks/__init__.py +12 -0
  41. mteb/benchmarks/benchmarks/benchmarks.py +361 -16
  42. mteb/benchmarks/get_benchmark.py +14 -53
  43. mteb/cache.py +227 -37
  44. mteb/cli/_display_tasks.py +2 -2
  45. mteb/cli/build_cli.py +110 -14
  46. mteb/cli/generate_model_card.py +43 -23
  47. mteb/deprecated_evaluator.py +71 -62
  48. mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
  49. mteb/descriptive_stats/Classification/HebrewSentimentAnalysis.v3.json +60 -0
  50. mteb/descriptive_stats/Classification/TurkishConstitutionalCourtViolation.json +54 -0
  51. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  52. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  53. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  54. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  55. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3ComputerScienceRetrieval.json +214 -0
  56. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3EnergyRetrieval.json +214 -0
  57. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceEnRetrieval.json +214 -0
  58. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3FinanceFrRetrieval.json +214 -0
  59. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3HrRetrieval.json +214 -0
  60. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3IndustrialRetrieval.json +214 -0
  61. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3NuclearRetrieval.json +214 -0
  62. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PharmaceuticalsRetrieval.json +214 -0
  63. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3PhysicsRetrieval.json +214 -0
  64. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore3TelecomRetrieval.json +214 -0
  65. mteb/descriptive_stats/PairClassification/TERRa.V2.json +35 -0
  66. mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
  67. mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
  68. mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
  69. mteb/descriptive_stats/Retrieval/ArguAna-NL.v2.json +30 -0
  70. mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
  72. mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
  73. mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
  74. mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
  75. mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
  76. mteb/descriptive_stats/Retrieval/NFCorpus-NL.v2.json +30 -0
  77. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  78. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  79. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  80. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  81. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  82. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  83. mteb/descriptive_stats/Retrieval/SCIDOCS-NL.v2.json +30 -0
  84. mteb/descriptive_stats/Retrieval/SQuADKorV1Retrieval.json +30 -0
  85. mteb/descriptive_stats/Retrieval/SciFact-NL.v2.json +30 -0
  86. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  87. mteb/evaluate.py +106 -75
  88. mteb/filter_tasks.py +25 -26
  89. mteb/get_tasks.py +29 -30
  90. mteb/languages/language_scripts.py +5 -3
  91. mteb/leaderboard/app.py +414 -151
  92. mteb/leaderboard/benchmark_selector.py +14 -5
  93. mteb/leaderboard/figures.py +13 -15
  94. mteb/leaderboard/table.py +82 -17
  95. mteb/load_results.py +12 -12
  96. mteb/models/__init__.py +4 -1
  97. mteb/models/abs_encoder.py +31 -23
  98. mteb/models/cache_wrappers/__init__.py +2 -1
  99. mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
  100. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +7 -6
  101. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
  102. mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
  103. mteb/models/cache_wrappers/cache_wrapper.py +3 -3
  104. mteb/models/get_model_meta.py +25 -118
  105. mteb/models/instruct_wrapper.py +33 -9
  106. mteb/models/model_implementations/align_models.py +8 -1
  107. mteb/models/model_implementations/amazon_models.py +1 -0
  108. mteb/models/model_implementations/andersborges.py +65 -0
  109. mteb/models/model_implementations/ara_models.py +9 -1
  110. mteb/models/model_implementations/arctic_models.py +16 -8
  111. mteb/models/model_implementations/b1ade_models.py +2 -1
  112. mteb/models/model_implementations/bedrock_models.py +4 -0
  113. mteb/models/model_implementations/bge_models.py +101 -17
  114. mteb/models/model_implementations/bica_model.py +35 -0
  115. mteb/models/model_implementations/blip2_models.py +13 -2
  116. mteb/models/model_implementations/blip_models.py +43 -16
  117. mteb/models/model_implementations/bm25.py +5 -4
  118. mteb/models/model_implementations/bmretriever_models.py +10 -4
  119. mteb/models/model_implementations/cadet_models.py +10 -1
  120. mteb/models/model_implementations/cde_models.py +25 -4
  121. mteb/models/model_implementations/clip_models.py +9 -6
  122. mteb/models/model_implementations/clips_models.py +100 -0
  123. mteb/models/model_implementations/codefuse_models.py +165 -3
  124. mteb/models/model_implementations/codesage_models.py +18 -3
  125. mteb/models/model_implementations/cohere_models.py +13 -6
  126. mteb/models/model_implementations/cohere_v.py +7 -2
  127. mteb/models/model_implementations/colpali_models.py +17 -9
  128. mteb/models/model_implementations/colqwen_models.py +275 -5
  129. mteb/models/model_implementations/colsmol_models.py +4 -2
  130. mteb/models/model_implementations/conan_models.py +2 -1
  131. mteb/models/model_implementations/dino_models.py +194 -23
  132. mteb/models/model_implementations/e5_instruct.py +27 -4
  133. mteb/models/model_implementations/e5_models.py +21 -110
  134. mteb/models/model_implementations/e5_v.py +7 -6
  135. mteb/models/model_implementations/eagerworks_models.py +164 -0
  136. mteb/models/model_implementations/emillykkejensen_models.py +91 -0
  137. mteb/models/model_implementations/en_code_retriever.py +2 -1
  138. mteb/models/model_implementations/euler_models.py +32 -0
  139. mteb/models/model_implementations/evaclip_models.py +4 -0
  140. mteb/models/model_implementations/fa_models.py +67 -9
  141. mteb/models/model_implementations/facebookai.py +205 -0
  142. mteb/models/model_implementations/geogpt_models.py +2 -1
  143. mteb/models/model_implementations/gme_v_models.py +17 -10
  144. mteb/models/model_implementations/google_models.py +17 -6
  145. mteb/models/model_implementations/granite_vision_embedding_models.py +8 -3
  146. mteb/models/model_implementations/gritlm_models.py +4 -2
  147. mteb/models/model_implementations/gte_models.py +99 -9
  148. mteb/models/model_implementations/hinvec_models.py +2 -1
  149. mteb/models/model_implementations/human.py +1 -0
  150. mteb/models/model_implementations/ibm_granite_models.py +36 -6
  151. mteb/models/model_implementations/inf_models.py +4 -2
  152. mteb/models/model_implementations/jasper_models.py +256 -3
  153. mteb/models/model_implementations/jina_clip.py +49 -10
  154. mteb/models/model_implementations/jina_models.py +222 -11
  155. mteb/models/model_implementations/kalm_models.py +203 -25
  156. mteb/models/model_implementations/kblab.py +37 -0
  157. mteb/models/model_implementations/kennethenevoldsen_models.py +74 -0
  158. mteb/models/model_implementations/kfst.py +25 -0
  159. mteb/models/model_implementations/kowshik24_models.py +32 -0
  160. mteb/models/model_implementations/lens_models.py +2 -0
  161. mteb/models/model_implementations/lgai_embedding_models.py +2 -1
  162. mteb/models/model_implementations/linq_models.py +4 -3
  163. mteb/models/model_implementations/listconranker.py +2 -2
  164. mteb/models/model_implementations/llm2clip_models.py +9 -6
  165. mteb/models/model_implementations/llm2vec_models.py +16 -8
  166. mteb/models/model_implementations/mcinext_models.py +7 -1
  167. mteb/models/model_implementations/mdbr_models.py +19 -3
  168. mteb/models/model_implementations/misc_models.py +422 -60
  169. mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
  170. mteb/models/model_implementations/mme5_models.py +2 -1
  171. mteb/models/model_implementations/moco_models.py +15 -4
  172. mteb/models/model_implementations/mod_models.py +191 -0
  173. mteb/models/model_implementations/model2vec_models.py +27 -14
  174. mteb/models/model_implementations/moka_models.py +4 -1
  175. mteb/models/model_implementations/nbailab.py +70 -0
  176. mteb/models/model_implementations/no_instruct_sentence_models.py +3 -2
  177. mteb/models/model_implementations/nomic_models.py +173 -6
  178. mteb/models/model_implementations/nomic_models_vision.py +8 -3
  179. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +32 -19
  180. mteb/models/model_implementations/nvidia_models.py +155 -20
  181. mteb/models/model_implementations/octen_models.py +254 -0
  182. mteb/models/model_implementations/openai_models.py +20 -16
  183. mteb/models/model_implementations/openclip_models.py +37 -13
  184. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -5
  185. mteb/models/model_implementations/ops_moa_models.py +5 -3
  186. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  187. mteb/models/model_implementations/pawan_models.py +39 -0
  188. mteb/models/model_implementations/piccolo_models.py +9 -1
  189. mteb/models/model_implementations/pixie_models.py +56 -0
  190. mteb/models/model_implementations/promptriever_models.py +12 -8
  191. mteb/models/model_implementations/pylate_models.py +46 -12
  192. mteb/models/model_implementations/qodo_models.py +4 -2
  193. mteb/models/model_implementations/qtack_models.py +2 -1
  194. mteb/models/model_implementations/qwen3_models.py +9 -6
  195. mteb/models/model_implementations/qzhou_models.py +5 -3
  196. mteb/models/model_implementations/random_baseline.py +19 -24
  197. mteb/models/model_implementations/rasgaard_models.py +34 -0
  198. mteb/models/model_implementations/reasonir_model.py +2 -1
  199. mteb/models/model_implementations/repllama_models.py +5 -3
  200. mteb/models/model_implementations/rerankers_custom.py +15 -9
  201. mteb/models/model_implementations/rerankers_monot5_based.py +31 -31
  202. mteb/models/model_implementations/richinfoai_models.py +2 -1
  203. mteb/models/model_implementations/ru_sentence_models.py +71 -20
  204. mteb/models/model_implementations/ruri_models.py +322 -0
  205. mteb/models/model_implementations/salesforce_models.py +6 -3
  206. mteb/models/model_implementations/samilpwc_models.py +2 -1
  207. mteb/models/model_implementations/sarashina_embedding_models.py +168 -0
  208. mteb/models/model_implementations/searchmap_models.py +2 -1
  209. mteb/models/model_implementations/seed_1_6_embedding_models.py +8 -2
  210. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +625 -0
  211. mteb/models/model_implementations/seed_models.py +1 -0
  212. mteb/models/model_implementations/sentence_transformers_models.py +177 -18
  213. mteb/models/model_implementations/shuu_model.py +32 -31
  214. mteb/models/model_implementations/siglip_models.py +30 -20
  215. mteb/models/model_implementations/slm_models.py +416 -0
  216. mteb/models/model_implementations/sonar_models.py +1 -0
  217. mteb/models/model_implementations/spartan8806_atles_champion.py +34 -0
  218. mteb/models/model_implementations/stella_models.py +23 -4
  219. mteb/models/model_implementations/tarka_models.py +376 -0
  220. mteb/models/model_implementations/text2vec_models.py +9 -3
  221. mteb/models/model_implementations/ua_sentence_models.py +11 -1
  222. mteb/models/model_implementations/uae_models.py +8 -1
  223. mteb/models/model_implementations/vdr_models.py +3 -1
  224. mteb/models/model_implementations/vi_vn_models.py +45 -6
  225. mteb/models/model_implementations/vista_models.py +2 -0
  226. mteb/models/model_implementations/vlm2vec_models.py +5 -3
  227. mteb/models/model_implementations/voyage_models.py +99 -0
  228. mteb/models/model_implementations/voyage_v.py +17 -9
  229. mteb/models/model_implementations/xyz_models.py +1 -0
  230. mteb/models/model_implementations/youtu_models.py +2 -1
  231. mteb/models/model_implementations/yuan_models.py +34 -0
  232. mteb/models/model_implementations/yuan_models_en.py +58 -0
  233. mteb/models/model_meta.py +498 -29
  234. mteb/models/models_protocols.py +22 -6
  235. mteb/models/search_encoder_index/__init__.py +7 -0
  236. mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
  237. mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
  238. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +160 -0
  239. mteb/models/search_wrappers.py +197 -65
  240. mteb/models/sentence_transformer_wrapper.py +52 -32
  241. mteb/models/vllm_wrapper.py +327 -0
  242. mteb/py.typed +0 -0
  243. mteb/results/benchmark_results.py +114 -65
  244. mteb/results/model_result.py +63 -26
  245. mteb/results/task_result.py +117 -77
  246. mteb/similarity_functions.py +60 -7
  247. mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
  248. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
  249. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
  250. mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +47 -5
  251. mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py +2 -6
  252. mteb/tasks/classification/ara/ajgt.py +1 -2
  253. mteb/tasks/classification/ara/hotel_review_sentiment_classification.py +1 -2
  254. mteb/tasks/classification/ara/online_store_review_sentiment_classification.py +1 -2
  255. mteb/tasks/classification/ara/restaurant_review_sentiment_classification.py +1 -2
  256. mteb/tasks/classification/ara/tweet_emotion_classification.py +1 -2
  257. mteb/tasks/classification/ara/tweet_sarcasm_classification.py +1 -2
  258. mteb/tasks/classification/ben/bengali_document_classification.py +1 -2
  259. mteb/tasks/classification/ben/bengali_hate_speech_classification.py +1 -2
  260. mteb/tasks/classification/ben/bengali_sentiment_analysis.py +1 -2
  261. mteb/tasks/classification/ces/csfdcz_movie_review_sentiment_classification.py +1 -2
  262. mteb/tasks/classification/ces/czech_product_review_sentiment_classification.py +1 -2
  263. mteb/tasks/classification/ces/czech_so_me_sentiment_classification.py +1 -2
  264. mteb/tasks/classification/dan/angry_tweets_classification.py +1 -2
  265. mteb/tasks/classification/dan/danish_political_comments_classification.py +1 -2
  266. mteb/tasks/classification/dan/ddisco_cohesion_classification.py +1 -2
  267. mteb/tasks/classification/dan/dk_hate_classification.py +2 -3
  268. mteb/tasks/classification/deu/german_politicians_twitter_sentiment_classification.py +1 -2
  269. mteb/tasks/classification/deu/ten_k_gnad_classification.py +1 -2
  270. mteb/tasks/classification/eng/amazon_polarity_classification.py +1 -2
  271. mteb/tasks/classification/eng/arxiv_classification.py +1 -2
  272. mteb/tasks/classification/eng/banking77_classification.py +1 -2
  273. mteb/tasks/classification/eng/dbpedia_classification.py +1 -2
  274. mteb/tasks/classification/eng/emotion_classification.py +1 -2
  275. mteb/tasks/classification/eng/financial_phrasebank_classification.py +1 -2
  276. mteb/tasks/classification/eng/frenk_en_classification.py +1 -2
  277. mteb/tasks/classification/eng/gtsrb_classification.py +1 -1
  278. mteb/tasks/classification/eng/imdb_classification.py +1 -2
  279. mteb/tasks/classification/eng/legal_bench_classification.py +14 -120
  280. mteb/tasks/classification/eng/news_classification.py +1 -2
  281. mteb/tasks/classification/eng/patch_camelyon_classification.py +1 -1
  282. mteb/tasks/classification/eng/patent_classification.py +1 -2
  283. mteb/tasks/classification/eng/poem_sentiment_classification.py +1 -2
  284. mteb/tasks/classification/eng/sds_eye_protection_classification.py +1 -2
  285. mteb/tasks/classification/eng/sds_gloves_classification.py +1 -2
  286. mteb/tasks/classification/eng/toxic_chat_classification.py +2 -19
  287. mteb/tasks/classification/eng/toxic_conversations_classification.py +1 -2
  288. mteb/tasks/classification/eng/tweet_sentiment_extraction_classification.py +1 -2
  289. mteb/tasks/classification/eng/tweet_topic_single_classification.py +2 -13
  290. mteb/tasks/classification/eng/ucf101_classification.py +1 -5
  291. mteb/tasks/classification/eng/wikipedia_bio_met_chem_classification.py +1 -2
  292. mteb/tasks/classification/eng/wikipedia_chem_fields_classification.py +1 -2
  293. mteb/tasks/classification/eng/wikipedia_comp_chem_spectroscopy_classification.py +1 -2
  294. mteb/tasks/classification/eng/wikipedia_crystallography_analytical_classification.py +1 -2
  295. mteb/tasks/classification/eng/wikipedia_theoretical_applied_classification.py +1 -2
  296. mteb/tasks/classification/eng/yahoo_answers_topics_classification.py +1 -2
  297. mteb/tasks/classification/eng/yelp_review_full_classification.py +1 -2
  298. mteb/tasks/classification/est/estonian_valence.py +2 -3
  299. mteb/tasks/classification/fas/fa_mteb_classification.py +7 -14
  300. mteb/tasks/classification/fil/filipino_hate_speech_classification.py +1 -2
  301. mteb/tasks/classification/fin/fin_toxicity_classification.py +2 -11
  302. mteb/tasks/classification/fra/french_book_reviews.py +1 -2
  303. mteb/tasks/classification/fra/movie_review_sentiment_classification.py +1 -2
  304. mteb/tasks/classification/guj/gujarati_news_classification.py +1 -2
  305. mteb/tasks/classification/heb/__init__.py +6 -1
  306. mteb/tasks/classification/heb/hebrew_sentiment_analysis.py +62 -4
  307. mteb/tasks/classification/hin/hindi_discourse_classification.py +1 -2
  308. mteb/tasks/classification/hin/sentiment_analysis_hindi.py +1 -2
  309. mteb/tasks/classification/hrv/frenk_hr_classification.py +1 -2
  310. mteb/tasks/classification/ind/indonesian_id_clickbait_classification.py +1 -2
  311. mteb/tasks/classification/ind/indonesian_mongabay_conservation_classification.py +1 -2
  312. mteb/tasks/classification/ita/italian_linguist_acceptability_classification.py +1 -2
  313. mteb/tasks/classification/jav/javanese_imdb_classification.py +1 -2
  314. mteb/tasks/classification/jpn/wrime_classification.py +1 -2
  315. mteb/tasks/classification/kan/kannada_news_classification.py +1 -2
  316. mteb/tasks/classification/kor/klue_tc.py +1 -2
  317. mteb/tasks/classification/kor/kor_hate_classification.py +2 -17
  318. mteb/tasks/classification/kor/kor_sarcasm_classification.py +2 -19
  319. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +3 -4
  320. mteb/tasks/classification/mal/malayalam_news_classification.py +1 -2
  321. mteb/tasks/classification/mar/marathi_news_classification.py +1 -2
  322. mteb/tasks/classification/mkd/macedonian_tweet_sentiment_classification.py +1 -2
  323. mteb/tasks/classification/multilingual/catalonia_tweet_classification.py +1 -6
  324. mteb/tasks/classification/multilingual/multi_hate_classification.py +1 -4
  325. mteb/tasks/classification/multilingual/ru_sci_bench_classification.py +4 -23
  326. mteb/tasks/classification/multilingual/scala_classification.py +2 -3
  327. mteb/tasks/classification/multilingual/sib200_classification.py +1 -6
  328. mteb/tasks/classification/mya/myanmar_news.py +1 -2
  329. mteb/tasks/classification/nep/nepali_news_classification.py +1 -2
  330. mteb/tasks/classification/nld/dutch_book_review_sentiment_classification.py +4 -2
  331. mteb/tasks/classification/nld/dutch_cola_classification.py +3 -0
  332. mteb/tasks/classification/nld/dutch_government_bias_classification.py +3 -0
  333. mteb/tasks/classification/nld/dutch_news_articles_classification.py +3 -0
  334. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +3 -0
  335. mteb/tasks/classification/nld/iconclass_classification.py +3 -0
  336. mteb/tasks/classification/nld/open_tender_classification.py +3 -0
  337. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +3 -0
  338. mteb/tasks/classification/nob/no_rec_classification.py +1 -2
  339. mteb/tasks/classification/nob/norwegian_parliament_classification.py +1 -2
  340. mteb/tasks/classification/ory/odia_news_classification.py +1 -2
  341. mteb/tasks/classification/pol/polish_classification.py +3 -6
  342. mteb/tasks/classification/ron/moroco.py +1 -2
  343. mteb/tasks/classification/ron/romanian_reviews_sentiment.py +1 -2
  344. mteb/tasks/classification/ron/romanian_sentiment_classification.py +1 -2
  345. mteb/tasks/classification/rus/georeview_classification.py +1 -2
  346. mteb/tasks/classification/rus/headline_classification.py +1 -2
  347. mteb/tasks/classification/rus/inappropriateness_classification.py +1 -2
  348. mteb/tasks/classification/rus/ru_reviews_classification.py +1 -2
  349. mteb/tasks/classification/rus/ru_toixic_classification_okmlcup.py +1 -2
  350. mteb/tasks/classification/rus/senti_ru_eval.py +1 -2
  351. mteb/tasks/classification/sin/sinhala_news_classification.py +1 -2
  352. mteb/tasks/classification/sin/sinhala_news_source_classification.py +1 -2
  353. mteb/tasks/classification/slk/csfdsk_movie_review_sentiment_classification.py +1 -2
  354. mteb/tasks/classification/slk/slovak_hate_speech_classification.py +1 -2
  355. mteb/tasks/classification/slk/slovak_movie_review_sentiment_classification.py +1 -2
  356. mteb/tasks/classification/slv/frenk_sl_classification.py +1 -2
  357. mteb/tasks/classification/spa/spanish_news_classification.py +1 -2
  358. mteb/tasks/classification/spa/spanish_sentiment_classification.py +1 -2
  359. mteb/tasks/classification/ssw/siswati_news_classification.py +1 -2
  360. mteb/tasks/classification/swa/swahili_news_classification.py +1 -2
  361. mteb/tasks/classification/swe/dalaj_classification.py +1 -2
  362. mteb/tasks/classification/swe/swe_rec_classification.py +1 -2
  363. mteb/tasks/classification/swe/swedish_sentiment_classification.py +1 -2
  364. mteb/tasks/classification/tam/tamil_news_classification.py +1 -2
  365. mteb/tasks/classification/tel/telugu_andhra_jyoti_news_classification.py +1 -2
  366. mteb/tasks/classification/tha/wisesight_sentiment_classification.py +1 -2
  367. mteb/tasks/classification/tsn/tswana_news_classification.py +1 -2
  368. mteb/tasks/classification/tur/__init__.py +4 -0
  369. mteb/tasks/classification/tur/turkish_constitutional_court.py +41 -0
  370. mteb/tasks/classification/tur/turkish_movie_sentiment_classification.py +1 -2
  371. mteb/tasks/classification/tur/turkish_product_sentiment_classification.py +1 -2
  372. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -15
  373. mteb/tasks/classification/urd/urdu_roman_sentiment_classification.py +1 -2
  374. mteb/tasks/classification/vie/amazon_counterfactual_vn_classification.py +1 -6
  375. mteb/tasks/classification/vie/amazon_polarity_vn_classification.py +1 -6
  376. mteb/tasks/classification/vie/amazon_reviews_vn_classification.py +1 -5
  377. mteb/tasks/classification/vie/banking77_vn_classification.py +1 -5
  378. mteb/tasks/classification/vie/emotion_vn_classification.py +1 -5
  379. mteb/tasks/classification/vie/imdb_vn_classification.py +1 -5
  380. mteb/tasks/classification/vie/massive_intent_vn_classification.py +1 -5
  381. mteb/tasks/classification/vie/massive_scenario_vn_classification.py +1 -5
  382. mteb/tasks/classification/vie/mtop_domain_vn_classification.py +1 -5
  383. mteb/tasks/classification/vie/mtop_intent_vn_classification.py +1 -5
  384. mteb/tasks/classification/vie/toxic_conversations_vn_classification.py +1 -5
  385. mteb/tasks/classification/vie/tweet_sentiment_extraction_vn_classification.py +1 -5
  386. mteb/tasks/classification/vie/vie_student_feedback_classification.py +1 -2
  387. mteb/tasks/classification/zho/cmteb_classification.py +5 -10
  388. mteb/tasks/classification/zho/yue_openrice_review_classification.py +1 -2
  389. mteb/tasks/classification/zul/isi_zulu_news_classification.py +1 -2
  390. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  391. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  392. mteb/tasks/clustering/jpn/mews_c16_ja_clustering.py +1 -3
  393. mteb/tasks/clustering/multilingual/sib200_clustering_s2s.py +1 -6
  394. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +3 -0
  395. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +3 -0
  396. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +3 -0
  397. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +3 -0
  398. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +3 -0
  399. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +3 -0
  400. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +3 -0
  401. mteb/tasks/clustering/vie/reddit_clustering_p2p_vn.py +1 -5
  402. mteb/tasks/clustering/vie/reddit_clustering_vn.py +1 -5
  403. mteb/tasks/clustering/vie/stack_exchange_clustering_p2p_vn.py +1 -5
  404. mteb/tasks/clustering/vie/stack_exchange_clustering_vn.py +1 -5
  405. mteb/tasks/clustering/vie/twenty_newsgroups_clustering_vn.py +1 -5
  406. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  407. mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
  408. mteb/tasks/multilabel_classification/ita/emit_classification.py +1 -5
  409. mteb/tasks/multilabel_classification/kor/kor_hate_speech_ml_classification.py +1 -9
  410. mteb/tasks/multilabel_classification/mlt/maltese_news_classification.py +1 -6
  411. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +3 -0
  412. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +3 -0
  413. mteb/tasks/multilabel_classification/por/brazilian_toxic_tweets_classification.py +1 -6
  414. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_group_classification.py +1 -1
  415. mteb/tasks/multilabel_classification/swe/swedish_patent_cpc_subclass_classification.py +1 -2
  416. mteb/tasks/pair_classification/dan/talemaader_pc.py +1 -6
  417. mteb/tasks/pair_classification/eng/legal_bench_pc.py +1 -9
  418. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +3 -0
  419. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +3 -0
  420. mteb/tasks/pair_classification/rus/__init__.py +2 -2
  421. mteb/tasks/pair_classification/rus/terra.py +51 -25
  422. mteb/tasks/pair_classification/vie/sprint_duplicate_questions_pcvn.py +1 -5
  423. mteb/tasks/pair_classification/vie/twitter_sem_eval2015_pcvn.py +1 -5
  424. mteb/tasks/pair_classification/vie/twitter_url_corpus_pcvn.py +1 -5
  425. mteb/tasks/regression/multilingual/ru_sci_bench_regression.py +2 -6
  426. mteb/tasks/reranking/jpn/__init__.py +9 -1
  427. mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
  428. mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
  429. mteb/tasks/reranking/multilingual/__init__.py +2 -0
  430. mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
  431. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  432. mteb/tasks/reranking/multilingual/x_glue_wpr_reranking.py +1 -2
  433. mteb/tasks/reranking/vie/ask_ubuntu_dup_questions_vn.py +1 -5
  434. mteb/tasks/reranking/vie/sci_docs_reranking_vn.py +1 -5
  435. mteb/tasks/reranking/vie/stack_overflow_dup_questions_vn.py +1 -5
  436. mteb/tasks/retrieval/code/code_rag.py +12 -12
  437. mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
  438. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  439. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  440. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  441. mteb/tasks/retrieval/eng/__init__.py +2 -0
  442. mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
  443. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  444. mteb/tasks/retrieval/eng/lit_search_retrieval.py +1 -8
  445. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
  446. mteb/tasks/retrieval/jpn/__init__.py +8 -0
  447. mteb/tasks/retrieval/jpn/ja_cwir_retrieval.py +1 -4
  448. mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
  449. mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
  450. mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
  451. mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
  452. mteb/tasks/retrieval/kat/georgian_faq_retrieval.py +11 -4
  453. mteb/tasks/retrieval/kor/__init__.py +16 -1
  454. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  455. mteb/tasks/retrieval/kor/squad_kor_v1_retrieval.py +47 -0
  456. mteb/tasks/retrieval/multilingual/__init__.py +24 -0
  457. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +5 -4
  458. mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
  459. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
  460. mteb/tasks/retrieval/multilingual/mkqa_retrieval.py +1 -2
  461. mteb/tasks/retrieval/multilingual/mlqa_retrieval.py +1 -4
  462. mteb/tasks/retrieval/multilingual/multi_long_doc_retrieval.py +1 -2
  463. mteb/tasks/retrieval/multilingual/public_health_qa_retrieval.py +9 -4
  464. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +2 -12
  465. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +4 -2
  466. mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +389 -0
  467. mteb/tasks/retrieval/nld/__init__.py +8 -4
  468. mteb/tasks/retrieval/nld/argu_ana_nl_retrieval.py +46 -27
  469. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +3 -0
  470. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +3 -0
  471. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +3 -0
  472. mteb/tasks/retrieval/nld/nf_corpus_nl_retrieval.py +42 -25
  473. mteb/tasks/retrieval/nld/open_tender_retrieval.py +3 -0
  474. mteb/tasks/retrieval/nld/sci_fact_nl_retrieval.py +42 -24
  475. mteb/tasks/retrieval/nld/scidocsnl_retrieval.py +44 -27
  476. mteb/tasks/retrieval/nld/vabb_retrieval.py +3 -0
  477. mteb/tasks/retrieval/nob/norquad.py +2 -2
  478. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  479. mteb/tasks/retrieval/slk/slovak_sum_retrieval.py +1 -7
  480. mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
  481. mteb/tasks/retrieval/vie/__init__.py +14 -6
  482. mteb/tasks/retrieval/vie/argu_ana_vn_retrieval.py +1 -5
  483. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +40 -5
  484. mteb/tasks/retrieval/vie/cqa_dupstack_android_vn_retrieval.py +1 -5
  485. mteb/tasks/retrieval/vie/cqa_dupstack_gis_vn_retrieval.py +1 -5
  486. mteb/tasks/retrieval/vie/cqa_dupstack_mathematica_vn_retrieval.py +1 -5
  487. mteb/tasks/retrieval/vie/cqa_dupstack_physics_vn_retrieval.py +1 -5
  488. mteb/tasks/retrieval/vie/cqa_dupstack_programmers_vn_retrieval.py +1 -5
  489. mteb/tasks/retrieval/vie/cqa_dupstack_stats_vn_retrieval.py +1 -5
  490. mteb/tasks/retrieval/vie/cqa_dupstack_tex_vn_retrieval.py +1 -5
  491. mteb/tasks/retrieval/vie/cqa_dupstack_unix_vn_retrieval.py +1 -5
  492. mteb/tasks/retrieval/vie/cqa_dupstack_webmasters_vn_retrieval.py +1 -5
  493. mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py +1 -5
  494. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +40 -5
  495. mteb/tasks/retrieval/vie/fevervn_retrieval.py +40 -7
  496. mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py +1 -5
  497. mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
  498. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +40 -6
  499. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +49 -5
  500. mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py +1 -5
  501. mteb/tasks/retrieval/vie/nqvn_retrieval.py +40 -5
  502. mteb/tasks/retrieval/vie/quora_vn_retrieval.py +1 -6
  503. mteb/tasks/retrieval/vie/sci_fact_vn_retrieval.py +1 -5
  504. mteb/tasks/retrieval/vie/scidocsvn_retrieval.py +1 -6
  505. mteb/tasks/retrieval/vie/touche2020_vn_retrieval.py +1 -5
  506. mteb/tasks/retrieval/vie/treccovidvn_retrieval.py +1 -5
  507. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  508. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  509. mteb/tasks/sts/nld/sick_nl_sts.py +1 -0
  510. mteb/tasks/sts/vie/biosses_stsvn.py +1 -5
  511. mteb/tasks/sts/vie/sickr_stsvn.py +1 -5
  512. mteb/tasks/sts/vie/sts_benchmark_stsvn.py +1 -5
  513. mteb/tasks/zeroshot_classification/eng/gtsrb.py +1 -1
  514. mteb/tasks/zeroshot_classification/eng/patch_camelyon.py +1 -1
  515. mteb/tasks/zeroshot_classification/eng/ucf101.py +1 -5
  516. mteb/types/__init__.py +2 -0
  517. mteb/types/_encoder_io.py +19 -2
  518. mteb/types/_result.py +2 -1
  519. mteb/types/statistics.py +9 -3
  520. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/METADATA +25 -8
  521. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/RECORD +525 -438
  522. mteb/models/model_implementations/mxbai_models.py +0 -102
  523. mteb/models/model_implementations/nb_sbert.py +0 -25
  524. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
  525. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
  526. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
  527. {mteb-2.1.4.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,9 @@
1
1
  from typing import Any
2
2
 
3
3
  import torch
4
- from PIL import Image
4
+ from packaging.version import Version
5
5
  from torch.utils.data import DataLoader
6
+ from transformers import __version__ as transformers_version
6
7
 
7
8
  from mteb.abstasks.task_metadata import TaskMetadata
8
9
  from mteb.models.abs_encoder import AbsEncoder
@@ -31,6 +32,14 @@ class LlamaNemoretrieverColembed(AbsEncoder):
31
32
  attn_implementation="flash_attention_2",
32
33
  **kwargs,
33
34
  ):
35
+ required_transformers_version = "4.49.0"
36
+
37
+ if Version(transformers_version) != Version(required_transformers_version):
38
+ raise RuntimeError(
39
+ f"transformers version {transformers_version} is not match with required "
40
+ f"install version {required_transformers_version} to run `nvidia/llama-nemoretriever-colembed`"
41
+ )
42
+
34
43
  from transformers import AutoModel
35
44
 
36
45
  self.model = AutoModel.from_pretrained(
@@ -53,6 +62,7 @@ class LlamaNemoretrieverColembed(AbsEncoder):
53
62
  **kwargs,
54
63
  ):
55
64
  import torchvision.transforms.functional as F
65
+ from PIL import Image
56
66
 
57
67
  all_images = []
58
68
  if isinstance(images, DataLoader):
@@ -61,14 +71,16 @@ class LlamaNemoretrieverColembed(AbsEncoder):
61
71
  iterator = DataLoader(images, batch_size=batch_size)
62
72
 
63
73
  for batch in iterator:
64
- for b in batch:
74
+ for image in batch["image"]:
65
75
  pil_img = (
66
- F.to_pil_image(b.to("cpu")) if not isinstance(b, Image.Image) else b
76
+ image
77
+ if isinstance(image, Image.Image)
78
+ else F.to_pil_image(image.to("cpu"))
67
79
  )
68
80
  all_images.append(pil_img)
69
81
 
70
82
  batch_size = 1
71
- return self.model.forward_passages(all_images, batch_size=batch_size)
83
+ return self.model.forward_images(all_images, batch_size=batch_size)
72
84
 
73
85
  def calculate_probs(self, text_embeddings, image_embeddings):
74
86
  scores = self.similarity(text_embeddings, image_embeddings)
@@ -117,19 +129,18 @@ class LlamaNemoretrieverColembed(AbsEncoder):
117
129
 
118
130
  TRAINING_DATA = {
119
131
  # from https://huggingface.co/datasets/vidore/colpali_train_set
120
- "DocVQA",
121
- "InfoVQA",
122
- "TATDQA",
123
- "arXivQA",
124
- "hotpotqa",
125
- "miracl",
132
+ "VidoreDocVQARetrieval",
133
+ "VidoreInfoVQARetrieval",
134
+ "VidoreTatdqaRetrieval",
135
+ "VidoreArxivQARetrieval",
136
+ "HotpotQA",
137
+ "MIRACLRetrieval",
126
138
  "NQ",
127
- "stackexchange",
139
+ "StackExchangeClustering",
128
140
  "SQuAD",
129
141
  "WebInstructSub",
130
142
  "docmatix-ir",
131
- "vdr-multilingual-train",
132
- "colpali_train_set", # as it contains PDFs
143
+ "VDRMultilingualRetrieval",
133
144
  "VisRAG-Ret-Train-Synthetic-data",
134
145
  "VisRAG-Ret-Train-In-domain-data",
135
146
  "wiki-ss-nq",
@@ -141,19 +152,20 @@ llama_nemoretriever_colembed_1b_v1 = ModelMeta(
141
152
  trust_remote_code=True,
142
153
  ),
143
154
  name="nvidia/llama-nemoretriever-colembed-1b-v1",
155
+ model_type=["late-interaction"],
144
156
  languages=["eng-Latn"],
145
- revision="1f0fdea7f5b19532a750be109b19072d719b8177",
157
+ revision="6eade800103413033f260bb55b49fe039fd28a6e",
146
158
  release_date="2025-06-27",
147
159
  modalities=["image", "text"],
148
160
  n_parameters=2_418_000_000,
149
- memory_usage_mb=9224,
161
+ memory_usage_mb=4610,
150
162
  max_tokens=8192,
151
163
  embed_dim=2048,
152
164
  license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
153
165
  open_weights=True,
154
166
  public_training_code="Proprietary Code",
155
167
  public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
156
- framework=["PyTorch"],
168
+ framework=["PyTorch", "Transformers", "safetensors"],
157
169
  reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1",
158
170
  similarity_fn_name="MaxSim",
159
171
  use_instructions=True,
@@ -167,19 +179,20 @@ llama_nemoretriever_colembed_3b_v1 = ModelMeta(
167
179
  trust_remote_code=True,
168
180
  ),
169
181
  name="nvidia/llama-nemoretriever-colembed-3b-v1",
182
+ model_type=["late-interaction"],
170
183
  languages=["eng-Latn"],
171
- revision="50c36f4d5271c6851aa08bd26d69f6e7ca8b870c",
184
+ revision="4194bdd2cd2871f220ddba6273ce173ef1217a1e",
172
185
  release_date="2025-06-27",
173
186
  modalities=["image", "text"],
174
187
  n_parameters=4_407_000_000,
175
- memory_usage_mb=16811,
188
+ memory_usage_mb=8403,
176
189
  max_tokens=8192,
177
190
  embed_dim=3072,
178
191
  license="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1/blob/main/LICENSE",
179
192
  open_weights=True,
180
193
  public_training_code="Proprietary Code",
181
194
  public_training_data="https://huggingface.co/nvidia/llama-nemoretriever-colembed-1b-v1#training-dataset",
182
- framework=["PyTorch"],
195
+ framework=["PyTorch", "Transformers", "safetensors"],
183
196
  reference="https://huggingface.co/nvidia/llama-nemoretriever-colembed-3b-v1",
184
197
  similarity_fn_name="MaxSim",
185
198
  use_instructions=True,
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from collections.abc import Callable
2
3
  from typing import Any
3
4
 
4
5
  import torch
@@ -9,8 +10,9 @@ from tqdm import tqdm
9
10
  from transformers import AutoModel, AutoTokenizer
10
11
  from transformers import __version__ as transformers_version
11
12
 
12
- from mteb import TaskMetadata
13
13
  from mteb._requires_package import requires_package
14
+ from mteb.abstasks.task_metadata import TaskMetadata
15
+ from mteb.models import CrossEncoderWrapper
14
16
  from mteb.models.abs_encoder import AbsEncoder
15
17
  from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
16
18
  from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -18,18 +20,28 @@ from mteb.types import Array, BatchedInput, PromptType
18
20
 
19
21
  logger = logging.getLogger(__name__)
20
22
 
21
- NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
22
- title={NV-Retriever: Improving text embedding models with effective hard-negative mining},
23
- author={Gabriel de Souza P. Moreira and Radek Osmulski and Mengyao Xu and Ronay Ak and Benedikt Schifferer and Even Oldridge},
23
+ NV_RETRIEVER_CITATION = """@misc{lee2025nvembedimprovedtechniquestraining,
24
+ title={NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models},
25
+ author={Chankyu Lee and Rajarshi Roy and Mengyao Xu and Jonathan Raiman and Mohammad Shoeybi and Bryan Catanzaro and Wei Ping},
24
26
  year={2025},
25
- eprint={2407.15831},
27
+ eprint={2405.17428},
26
28
  archivePrefix={arXiv},
27
- primaryClass={cs.IR},
28
- url={https://arxiv.org/abs/2407.15831}
29
+ primaryClass={cs.CL},
30
+ url={https://arxiv.org/abs/2405.17428},
31
+ }"""
32
+
33
+ LlamaEmbedNemotron_CITATION = """@misc{babakhin2025llamaembednemotron8buniversaltextembedding,
34
+ title={Llama-Embed-Nemotron-8B: A Universal Text Embedding Model for Multilingual and Cross-Lingual Tasks},
35
+ author={Yauhen Babakhin and Radek Osmulski and Ronay Ak and Gabriel Moreira and Mengyao Xu and Benedikt Schifferer and Bo Liu and Even Oldridge},
36
+ year={2025},
37
+ eprint={2511.07025},
38
+ archivePrefix={arXiv},
39
+ primaryClass={cs.CL},
40
+ url={https://arxiv.org/abs/2511.07025},
29
41
  }"""
30
42
 
31
43
 
32
- def instruction_template(
44
+ def _instruction_template(
33
45
  instruction: str, prompt_type: PromptType | None = None
34
46
  ) -> str:
35
47
  return f"Instruct: {instruction}\nQuery: " if instruction else ""
@@ -100,10 +112,77 @@ nvidia_training_datasets = {
100
112
  "MrTidyRetrieval",
101
113
  }
102
114
 
115
+
116
+ class _NVEmbedWrapper(InstructSentenceTransformerModel):
117
+ """Inherited, because nvembed requires `sbert==2`, but it doesn't have tokenizers kwargs"""
118
+
119
+ def __init__(
120
+ self,
121
+ model_name: str,
122
+ revision: str,
123
+ instruction_template: str
124
+ | Callable[[str, PromptType | None], str]
125
+ | None = None,
126
+ max_seq_length: int | None = None,
127
+ apply_instruction_to_passages: bool = True,
128
+ padding_side: str | None = None,
129
+ add_eos_token: bool = False,
130
+ prompts_dict: dict[str, str] | None = None,
131
+ **kwargs: Any,
132
+ ):
133
+ from sentence_transformers import __version__ as sbert_version
134
+
135
+ required_transformers_version = "4.42.4"
136
+ required_sbert_version = "2.7.0"
137
+
138
+ if Version(transformers_version) != Version(required_transformers_version):
139
+ raise RuntimeError(
140
+ f"transformers version {transformers_version} is not match with required "
141
+ f"install version {required_transformers_version} to run `nvidia/NV-Embed-v2`"
142
+ )
143
+
144
+ if Version(sbert_version) != Version(required_sbert_version):
145
+ raise RuntimeError(
146
+ f"sbert version {sbert_version} is not match with required "
147
+ f"install version {required_sbert_version} to run `nvidia/NV-Embed-v2`"
148
+ )
149
+
150
+ requires_package(
151
+ self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
152
+ )
153
+
154
+ from sentence_transformers import SentenceTransformer
155
+
156
+ if (
157
+ isinstance(instruction_template, str)
158
+ and "{instruction}" not in instruction_template
159
+ ):
160
+ raise ValueError(
161
+ "Instruction template must contain the string '{instruction}'."
162
+ )
163
+ if instruction_template is None:
164
+ logger.warning(
165
+ "No instruction template provided. Instructions will be used as-is."
166
+ )
167
+
168
+ self.instruction_template = instruction_template
169
+
170
+ self.model_name = model_name
171
+ self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
172
+ self.model.tokenizer.padding_side = padding_side
173
+ self.model.tokenizer.add_eos_token = add_eos_token
174
+
175
+ if max_seq_length:
176
+ # https://github.com/huggingface/sentence-transformers/issues/3575
177
+ self.model.max_seq_length = max_seq_length
178
+ self.apply_instruction_to_passages = apply_instruction_to_passages
179
+ self.prompts_dict = prompts_dict
180
+
181
+
103
182
  NV_embed_v2 = ModelMeta(
104
- loader=InstructSentenceTransformerModel,
183
+ loader=_NVEmbedWrapper,
105
184
  loader_kwargs=dict(
106
- instruction_template=instruction_template,
185
+ instruction_template=_instruction_template,
107
186
  trust_remote_code=True,
108
187
  max_seq_length=32768,
109
188
  padding_side="right",
@@ -111,6 +190,7 @@ NV_embed_v2 = ModelMeta(
111
190
  add_eos_token=True,
112
191
  ),
113
192
  name="nvidia/NV-Embed-v2",
193
+ model_type=["dense"],
114
194
  languages=["eng-Latn"],
115
195
  open_weights=True,
116
196
  revision="7604d305b621f14095a1aa23d351674c2859553a",
@@ -122,7 +202,7 @@ NV_embed_v2 = ModelMeta(
122
202
  max_tokens=32768,
123
203
  reference="https://huggingface.co/nvidia/NV-Embed-v2",
124
204
  similarity_fn_name=ScoringFunction.COSINE,
125
- framework=["Sentence Transformers", "PyTorch"],
205
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
126
206
  use_instructions=True,
127
207
  training_datasets=nvidia_training_datasets,
128
208
  public_training_code=None,
@@ -131,9 +211,9 @@ NV_embed_v2 = ModelMeta(
131
211
  )
132
212
 
133
213
  NV_embed_v1 = ModelMeta(
134
- loader=InstructSentenceTransformerModel,
214
+ loader=_NVEmbedWrapper,
135
215
  loader_kwargs=dict(
136
- instruction_template=instruction_template,
216
+ instruction_template=_instruction_template,
137
217
  trust_remote_code=True,
138
218
  max_seq_length=32768,
139
219
  padding_side="right",
@@ -141,18 +221,19 @@ NV_embed_v1 = ModelMeta(
141
221
  add_eos_token=True,
142
222
  ),
143
223
  name="nvidia/NV-Embed-v1",
224
+ model_type=["dense"],
144
225
  languages=["eng-Latn"],
145
226
  open_weights=True,
146
227
  revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c",
147
228
  release_date="2024-09-13", # initial commit of hf model.
148
229
  n_parameters=7_850_000_000,
149
- memory_usage_mb=29945,
230
+ memory_usage_mb=14975,
150
231
  embed_dim=4096,
151
232
  license="cc-by-nc-4.0",
152
233
  max_tokens=32768,
153
234
  reference="https://huggingface.co/nvidia/NV-Embed-v1",
154
235
  similarity_fn_name=ScoringFunction.COSINE,
155
- framework=["Sentence Transformers", "PyTorch"],
236
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
156
237
  use_instructions=True,
157
238
  training_datasets=nvidia_training_datasets,
158
239
  public_training_code=None,
@@ -335,6 +416,7 @@ class LlamaEmbedNemotron(AbsEncoder):
335
416
  self,
336
417
  model_name: str,
337
418
  revision: str,
419
+ device: str | None = None,
338
420
  ) -> None:
339
421
  required_transformers_version = "4.51.0"
340
422
  if Version(transformers_version) != Version(required_transformers_version):
@@ -353,7 +435,7 @@ class LlamaEmbedNemotron(AbsEncoder):
353
435
  self.attn_implementation = (
354
436
  "flash_attention_2" if torch.cuda.is_available() else "eager"
355
437
  )
356
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
438
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
357
439
  self.task_prompts = TASK_PROMPTS
358
440
  self.instruction_template = self._instruction_template
359
441
 
@@ -528,6 +610,7 @@ class LlamaEmbedNemotron(AbsEncoder):
528
610
  llama_embed_nemotron_8b = ModelMeta(
529
611
  loader=LlamaEmbedNemotron,
530
612
  name="nvidia/llama-embed-nemotron-8b",
613
+ model_type=["dense"],
531
614
  languages=llama_embed_nemotron_evaluated_languages,
532
615
  open_weights=True,
533
616
  revision="84a375593d27d3528beb4e104822515659e093b4",
@@ -539,11 +622,63 @@ llama_embed_nemotron_8b = ModelMeta(
539
622
  max_tokens=32768,
540
623
  reference="https://huggingface.co/nvidia/llama-embed-nemotron-8b",
541
624
  similarity_fn_name="cosine",
542
- framework=["PyTorch"],
625
+ framework=["PyTorch", "Sentence Transformers", "safetensors", "Transformers"],
543
626
  use_instructions=True,
544
627
  training_datasets=llama_embed_nemotron_training_datasets,
545
- public_training_code=None, # Will be released later
546
- public_training_data=None, # Will be released later
628
+ public_training_code="https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/biencoder/llama_embed_nemotron_8b",
629
+ public_training_data="https://huggingface.co/datasets/nvidia/embed-nemotron-dataset-v1",
547
630
  contacts=["ybabakhin"],
548
- citation=NV_RETRIEVER_CITATION,
631
+ citation=LlamaEmbedNemotron_CITATION,
632
+ )
633
+
634
+
635
+ def _nemotron_rerank_model(model: str, revision: str, **kwargs) -> CrossEncoderWrapper:
636
+ required_transformers_version = "4.47.1"
637
+
638
+ if Version(transformers_version) != Version(required_transformers_version):
639
+ raise RuntimeError(
640
+ f"transformers version {transformers_version} is not match with required "
641
+ f"install version {required_transformers_version} to run `nvidia/llama-nemotron-rerank-1b-v2`"
642
+ )
643
+
644
+ return CrossEncoderWrapper(
645
+ model=model,
646
+ revision=revision,
647
+ **kwargs,
648
+ )
649
+
650
+
651
+ nemotron_rerank_1b_v2 = ModelMeta(
652
+ loader=_nemotron_rerank_model,
653
+ loader_kwargs=dict(
654
+ trust_remote_code=True,
655
+ query_prefix="question:",
656
+ passage_prefix=" \n \n passage:",
657
+ model_kwargs={"torch_dtype": torch.float32},
658
+ ),
659
+ name="nvidia/llama-nemotron-rerank-1b-v2",
660
+ revision="78efcfdc23b53a753f6c73f2d78b18132a34ac4d",
661
+ release_date="2025-10-16",
662
+ languages=["eng-Latn"],
663
+ n_parameters=1235816448,
664
+ memory_usage_mb=2357.0,
665
+ max_tokens=4096,
666
+ embed_dim=2048,
667
+ license="https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/",
668
+ open_weights=True,
669
+ public_training_code=None,
670
+ public_training_data=None,
671
+ framework=["PyTorch", "Sentence Transformers"],
672
+ reference="https://huggingface.co/nvidia/llama-nemotron-rerank-1b-v2",
673
+ similarity_fn_name=ScoringFunction.COSINE,
674
+ use_instructions=None,
675
+ training_datasets=set(
676
+ # private
677
+ ),
678
+ adapted_from="meta-llama/Llama-3.2-1B",
679
+ superseded_by=None,
680
+ modalities=["text"],
681
+ model_type=["cross-encoder"],
682
+ citation=None,
683
+ contacts=None,
549
684
  )
@@ -0,0 +1,254 @@
1
+ from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
2
+ from mteb.models.model_meta import ModelMeta
3
+ from mteb.models.models_protocols import PromptType
4
+
5
+
6
+ def instruction_template(
7
+ instruction: str, prompt_type: PromptType | None = None
8
+ ) -> str:
9
+ if (
10
+ prompt_type == PromptType.document
11
+ ): # to avoid this issue: https://huggingface.co/Qwen/Qwen3-Embedding-8B/discussions/21
12
+ return " "
13
+ if not instruction:
14
+ return ""
15
+ if isinstance(instruction, dict):
16
+ if prompt_type is None:
17
+ instruction = next(iter(instruction.values())) # TODO
18
+ else:
19
+ instruction = instruction[prompt_type]
20
+ return f"Instruct: {instruction}\nQuery:"
21
+
22
+
23
+ multilingual_langs = [
24
+ "afr-Latn",
25
+ "ara-Arab",
26
+ "aze-Latn",
27
+ "bel-Cyrl",
28
+ "bul-Cyrl",
29
+ "ben-Beng",
30
+ "cat-Latn",
31
+ "ceb-Latn",
32
+ "ces-Latn",
33
+ "cym-Latn",
34
+ "dan-Latn",
35
+ "deu-Latn",
36
+ "ell-Grek",
37
+ "eng-Latn",
38
+ "spa-Latn",
39
+ "est-Latn",
40
+ "eus-Latn",
41
+ "fas-Arab",
42
+ "fin-Latn",
43
+ "fra-Latn",
44
+ "glg-Latn",
45
+ "guj-Gujr",
46
+ "heb-Hebr",
47
+ "hin-Deva",
48
+ "hrv-Latn",
49
+ "hat-Latn",
50
+ "hun-Latn",
51
+ "hye-Armn",
52
+ "ind-Latn",
53
+ "isl-Latn",
54
+ "ita-Latn",
55
+ "jpn-Jpan",
56
+ "jav-Latn",
57
+ "kat-Geor",
58
+ "kaz-Cyrl",
59
+ "khm-Khmr",
60
+ "kan-Knda",
61
+ "kor-Hang",
62
+ "kir-Cyrl",
63
+ "lao-Laoo",
64
+ "lit-Latn",
65
+ "lav-Latn",
66
+ "mkd-Cyrl",
67
+ "mal-Mlym",
68
+ "mon-Cyrl",
69
+ "mar-Deva",
70
+ "msa-Latn",
71
+ "mya-Mymr",
72
+ "nep-Deva",
73
+ "nld-Latn",
74
+ "nor-Latn",
75
+ "nob-Latn",
76
+ "nno-Latn",
77
+ "pan-Guru",
78
+ "pol-Latn",
79
+ "por-Latn",
80
+ "que-Latn",
81
+ "ron-Latn",
82
+ "rus-Cyrl",
83
+ "sin-Sinh",
84
+ "slk-Latn",
85
+ "slv-Latn",
86
+ "swa-Latn",
87
+ "tam-Taml",
88
+ "tel-Telu",
89
+ "tha-Thai",
90
+ "tgl-Latn",
91
+ "tur-Latn",
92
+ "ukr-Cyrl",
93
+ "urd-Arab",
94
+ "vie-Latn",
95
+ "yor-Latn",
96
+ "zho-Hans",
97
+ ]
98
+
99
+ OCTEN_CITATION = """@misc{octen-embedding-2025,
100
+ title={Octen-Embedding-8B: A Fine-tuned Multilingual Text Embedding Model},
101
+ author={Octen Team},
102
+ year={2025},
103
+ url={https://huggingface.co/bflhc/bflhc/Octen-Embedding-8B}
104
+ }"""
105
+
106
+ training_data = {
107
+ "T2Retrieval",
108
+ "DuRetrieval",
109
+ "MMarcoReranking",
110
+ "CMedQAv2-reranking",
111
+ "NQ",
112
+ "MSMARCO",
113
+ "HotpotQA",
114
+ "FEVER",
115
+ "MrTidyRetrieval",
116
+ "MIRACLRetrieval",
117
+ "CodeSearchNet",
118
+ }
119
+
120
+ # Predefined prompts for various RTEB tasks
121
+ _PREDEFINED_PROMPTS = {
122
+ # ========== Open Datasets ==========
123
+ # Legal domain
124
+ "AILACasedocs": "Given a legal case scenario, retrieve the most relevant case documents",
125
+ "AILAStatutes": "Given a legal scenario, retrieve the most relevant statute documents",
126
+ "LegalQuAD": "Given a legal question, retrieve relevant legal documents that answer the question",
127
+ "LegalSummarization": "Given a query, retrieve relevant legal documents for summarization",
128
+ # Code domain
129
+ "AppsRetrieval": "Given a query about mobile applications, retrieve relevant app information",
130
+ "HumanEvalRetrieval": "Given a code problem description, retrieve relevant code examples",
131
+ "MBPPRetrieval": "Given a programming problem description, retrieve relevant code solutions",
132
+ "DS1000Retrieval": "Given a data science problem, retrieve relevant code snippets",
133
+ "FreshStackRetrieval": "Given a programming question, retrieve relevant Stack Overflow posts",
134
+ # Finance domain
135
+ "FinQARetrieval": "Given a financial question, retrieve relevant financial documents",
136
+ "FinanceBenchRetrieval": "Given a financial query, retrieve relevant financial information",
137
+ "HC3FinanceRetrieval": "Given a finance-related query, retrieve relevant documents",
138
+ # Medical domain
139
+ "CUREv1": "Given a medical query, retrieve relevant clinical documents",
140
+ "ChatDoctorRetrieval": "Given a medical question, retrieve relevant medical information",
141
+ # SQL domain
142
+ "WikiSQLRetrieval": "Given a natural language query, retrieve relevant SQL examples",
143
+ # Multilingual
144
+ "MIRACLRetrievalHardNegatives": "Given a question, retrieve Wikipedia passages that answer the question",
145
+ # ========== Private/Closed Datasets ==========
146
+ # Code domain (Private)
147
+ "Code1Retrieval": "Given a code problem description, retrieve relevant code examples",
148
+ "JapaneseCode1Retrieval": "Given a code problem description, retrieve relevant code examples",
149
+ # Finance domain (Private)
150
+ "EnglishFinance1Retrieval": "Given a financial query, retrieve relevant financial documents",
151
+ "EnglishFinance2Retrieval": "Given a financial query, retrieve relevant financial documents",
152
+ "EnglishFinance3Retrieval": "Given a financial query, retrieve relevant financial documents",
153
+ "EnglishFinance4Retrieval": "Given a financial query, retrieve relevant financial documents",
154
+ # Healthcare domain (Private)
155
+ "EnglishHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
156
+ "GermanHealthcare1Retrieval": "Given a medical question, retrieve relevant medical information",
157
+ # Legal domain (Private)
158
+ "FrenchLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
159
+ "GermanLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
160
+ "JapaneseLegal1Retrieval": "Given a legal query, retrieve relevant legal documents",
161
+ # General/Multilingual (Private)
162
+ "French1Retrieval": "Given a query, retrieve relevant passages",
163
+ "German1Retrieval": "Given a query, retrieve relevant passages",
164
+ }
165
+
166
+ Octen_Embedding_0B6 = ModelMeta(
167
+ loader=InstructSentenceTransformerModel,
168
+ loader_kwargs=dict(
169
+ instruction_template=instruction_template,
170
+ apply_instruction_to_passages=True,
171
+ prompts_dict=_PREDEFINED_PROMPTS,
172
+ max_seq_length=18480,
173
+ model_kwargs={"torch_dtype": "bfloat16"},
174
+ ),
175
+ name="bflhc/Octen-Embedding-0.6B",
176
+ languages=multilingual_langs,
177
+ open_weights=True,
178
+ revision="1a00a4e837bd788f6f8d91bc43201a5e52cf8ef8",
179
+ release_date="2026-01-10",
180
+ n_parameters=595776512,
181
+ memory_usage_mb=1136,
182
+ embed_dim=1024,
183
+ max_tokens=32768,
184
+ license="apache-2.0",
185
+ reference="https://huggingface.co/bflhc/Octen-Embedding-0.6B",
186
+ similarity_fn_name="cosine",
187
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
188
+ use_instructions=True,
189
+ public_training_code=None,
190
+ public_training_data=None,
191
+ training_datasets=training_data,
192
+ citation=OCTEN_CITATION,
193
+ adapted_from="Qwen/Qwen3-Embedding-0.6B",
194
+ )
195
+
196
+ Octen_Embedding_4B = ModelMeta(
197
+ loader=InstructSentenceTransformerModel,
198
+ loader_kwargs=dict(
199
+ instruction_template=instruction_template,
200
+ apply_instruction_to_passages=True,
201
+ prompts_dict=_PREDEFINED_PROMPTS,
202
+ max_seq_length=18480,
203
+ model_kwargs={"torch_dtype": "bfloat16"},
204
+ ),
205
+ name="bflhc/Octen-Embedding-4B",
206
+ languages=multilingual_langs,
207
+ open_weights=True,
208
+ revision="6e188e3b072c3e3678b235ad84e6e97bcbb71e8f",
209
+ release_date="2025-12-30",
210
+ n_parameters=4021774336,
211
+ memory_usage_mb=7671,
212
+ embed_dim=2560,
213
+ max_tokens=32768,
214
+ license="apache-2.0",
215
+ reference="https://huggingface.co/bflhc/Octen-Embedding-4B",
216
+ similarity_fn_name="cosine",
217
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
218
+ use_instructions=True,
219
+ public_training_code=None,
220
+ public_training_data=None,
221
+ training_datasets=training_data,
222
+ citation=OCTEN_CITATION,
223
+ adapted_from="Qwen/Qwen3-Embedding-4B",
224
+ )
225
+
226
+ Octen_Embedding_8B = ModelMeta(
227
+ loader=InstructSentenceTransformerModel,
228
+ loader_kwargs=dict(
229
+ instruction_template=instruction_template,
230
+ apply_instruction_to_passages=True,
231
+ prompts_dict=_PREDEFINED_PROMPTS,
232
+ max_seq_length=18480,
233
+ model_kwargs={"torch_dtype": "bfloat16"},
234
+ ),
235
+ name="bflhc/Octen-Embedding-8B",
236
+ languages=multilingual_langs,
237
+ open_weights=True,
238
+ revision="f7db178d5a82fb841f606a6a67c423cead2fdbba",
239
+ release_date="2025-12-23",
240
+ n_parameters=7567295488,
241
+ memory_usage_mb=14433,
242
+ embed_dim=4096,
243
+ max_tokens=32768,
244
+ license="apache-2.0",
245
+ reference="https://huggingface.co/bflhc/Octen-Embedding-8B",
246
+ similarity_fn_name="cosine",
247
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
248
+ use_instructions=True,
249
+ public_training_code=None,
250
+ public_training_data=None,
251
+ training_datasets=training_data,
252
+ citation=OCTEN_CITATION,
253
+ adapted_from="Qwen/Qwen3-Embedding-8B",
254
+ )