transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,7 @@
15
15
  # limitations under the License.
16
16
  """Tokenization classes for MPNet."""
17
17
 
18
- from typing import Optional
18
+ from typing import Optional, Union
19
19
 
20
20
  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
21
21
  from tokenizers.models import WordPiece
@@ -38,7 +38,7 @@ class MPNetTokenizer(TokenizersBackend):
38
38
  refer to this superclass for more information regarding those methods.
39
39
 
40
40
  Args:
41
- vocab (`dict`, *optional*):
41
+ vocab (`str` or `dict[str, int]`, *optional*):
42
42
  Dictionary mapping tokens to their IDs. If not provided, an empty vocab is initialized.
43
43
  do_lower_case (`bool`, *optional*, defaults to `True`):
44
44
  Whether or not to lowercase the input when tokenizing.
@@ -87,10 +87,11 @@ class MPNetTokenizer(TokenizersBackend):
87
87
 
88
88
  vocab_files_names = VOCAB_FILES_NAMES
89
89
  model_input_names = ["input_ids", "attention_mask"]
90
+ model = WordPiece
90
91
 
91
92
  def __init__(
92
93
  self,
93
- vocab: Optional[dict] = None,
94
+ vocab: Optional[Union[str, dict[str, int]]] = None,
94
95
  do_lower_case=True,
95
96
  bos_token="<s>",
96
97
  eos_token="</s>",
@@ -104,12 +105,7 @@ class MPNetTokenizer(TokenizersBackend):
104
105
  **kwargs,
105
106
  ):
106
107
  # Initialize vocab
107
- if vocab is not None:
108
- self._vocab = (
109
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
110
- )
111
- else:
112
- self._vocab = {}
108
+ self._vocab = vocab if vocab is not None else {}
113
109
 
114
110
  # Initialize the tokenizer with WordPiece model
115
111
  self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
@@ -142,11 +138,7 @@ class MPNetTokenizer(TokenizersBackend):
142
138
  # Mask token behave like a normal word, i.e. include the space before it
143
139
  mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
144
140
 
145
- # Store for later use
146
- tokenizer_object = self._tokenizer
147
-
148
141
  super().__init__(
149
- tokenizer_object=tokenizer_object,
150
142
  do_lower_case=do_lower_case,
151
143
  bos_token=bos_token,
152
144
  eos_token=eos_token,
@@ -498,6 +498,7 @@ class MptForSequenceClassification(MptPreTrainedModel):
498
498
  output_attentions: Optional[bool] = None,
499
499
  output_hidden_states: Optional[bool] = None,
500
500
  return_dict: Optional[bool] = None,
501
+ **kwargs,
501
502
  ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
502
503
  r"""
503
504
  input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -700,6 +701,7 @@ class MptForQuestionAnswering(MptPreTrainedModel):
700
701
  output_attentions: Optional[bool] = None,
701
702
  output_hidden_states: Optional[bool] = None,
702
703
  return_dict: Optional[bool] = None,
704
+ **kwargs,
703
705
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
704
706
  r"""
705
707
  input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -54,7 +54,7 @@ def load_cuda_kernels():
54
54
  global mra_cuda_kernel
55
55
  if not is_kernels_available():
56
56
  raise ImportError("kernels is not installed, please install it with `pip install kernels`")
57
- from kernels import get_kernel
57
+ from ...integrations.hub_kernels import get_kernel
58
58
 
59
59
  mra_cuda_kernel = get_kernel("kernels-community/mra")
60
60
 
@@ -796,6 +796,9 @@ class MraPreTrainedModel(PreTrainedModel):
796
796
  super()._init_weights(module)
797
797
  if isinstance(module, MraLMPredictionHead):
798
798
  init.zeros_(module.bias)
799
+ elif isinstance(module, MraEmbeddings):
800
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
801
+ init.zeros_(module.token_type_ids)
799
802
 
800
803
 
801
804
  @auto_docstring
@@ -826,6 +829,7 @@ class MraModel(MraPreTrainedModel):
826
829
  inputs_embeds: Optional[torch.Tensor] = None,
827
830
  output_hidden_states: Optional[bool] = None,
828
831
  return_dict: Optional[bool] = None,
832
+ **kwargs,
829
833
  ) -> Union[tuple, BaseModelOutputWithCrossAttentions]:
830
834
  output_hidden_states = (
831
835
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -919,6 +923,7 @@ class MraForMaskedLM(MraPreTrainedModel):
919
923
  labels: Optional[torch.Tensor] = None,
920
924
  output_hidden_states: Optional[bool] = None,
921
925
  return_dict: Optional[bool] = None,
926
+ **kwargs,
922
927
  ) -> Union[tuple, MaskedLMOutput]:
923
928
  r"""
924
929
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1007,6 +1012,7 @@ class MraForSequenceClassification(MraPreTrainedModel):
1007
1012
  labels: Optional[torch.Tensor] = None,
1008
1013
  output_hidden_states: Optional[bool] = None,
1009
1014
  return_dict: Optional[bool] = None,
1015
+ **kwargs,
1010
1016
  ) -> Union[tuple, SequenceClassifierOutput]:
1011
1017
  r"""
1012
1018
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1086,6 +1092,7 @@ class MraForMultipleChoice(MraPreTrainedModel):
1086
1092
  labels: Optional[torch.Tensor] = None,
1087
1093
  output_hidden_states: Optional[bool] = None,
1088
1094
  return_dict: Optional[bool] = None,
1095
+ **kwargs,
1089
1096
  ) -> Union[tuple, MultipleChoiceModelOutput]:
1090
1097
  r"""
1091
1098
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1189,6 +1196,7 @@ class MraForTokenClassification(MraPreTrainedModel):
1189
1196
  labels: Optional[torch.Tensor] = None,
1190
1197
  output_hidden_states: Optional[bool] = None,
1191
1198
  return_dict: Optional[bool] = None,
1199
+ **kwargs,
1192
1200
  ) -> Union[tuple, TokenClassifierOutput]:
1193
1201
  r"""
1194
1202
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1263,6 +1271,7 @@ class MraForQuestionAnswering(MraPreTrainedModel):
1263
1271
  end_positions: Optional[torch.Tensor] = None,
1264
1272
  output_hidden_states: Optional[bool] = None,
1265
1273
  return_dict: Optional[bool] = None,
1274
+ **kwargs,
1266
1275
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
1267
1276
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1268
1277
 
@@ -133,17 +133,16 @@ class MT5Config(PreTrainedConfig):
133
133
  if feed_forward_proj == "gated-gelu":
134
134
  self.dense_act_fn = "gelu_new"
135
135
 
136
+ # Force because official weights have False serialized, but we have to tie always
137
+ kwargs["tie_word_embeddings"] = True
136
138
  super().__init__(
137
139
  is_encoder_decoder=is_encoder_decoder,
138
140
  tokenizer_class=tokenizer_class,
139
- tie_word_embeddings=tie_word_embeddings,
140
141
  pad_token_id=pad_token_id,
141
142
  eos_token_id=eos_token_id,
142
143
  decoder_start_token_id=decoder_start_token_id,
143
144
  **kwargs,
144
145
  )
145
- # TODO: Mt5 never supported not tying encoder decoder so this has to be true.
146
- self.tie_encoder_decoder = True
147
146
 
148
147
 
149
148
  __all__ = ["MT5Config"]
@@ -671,6 +671,7 @@ class MT5Stack(MT5PreTrainedModel):
671
671
  output_hidden_states=None,
672
672
  return_dict=None,
673
673
  cache_position=None,
674
+ **kwargs,
674
675
  ):
675
676
  use_cache = use_cache if use_cache is not None else self.config.use_cache
676
677
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -859,12 +860,10 @@ class MT5Model(MT5PreTrainedModel):
859
860
  encoder_config = copy.deepcopy(config)
860
861
  encoder_config.is_decoder = False
861
862
  encoder_config.use_cache = False
862
- encoder_config.tie_encoder_decoder = False
863
863
  self.encoder = MT5Stack(encoder_config)
864
864
 
865
865
  decoder_config = copy.deepcopy(config)
866
866
  decoder_config.is_decoder = True
867
- decoder_config.tie_encoder_decoder = False
868
867
  decoder_config.num_layers = config.num_decoder_layers
869
868
  self.decoder = MT5Stack(decoder_config)
870
869
 
@@ -898,6 +897,7 @@ class MT5Model(MT5PreTrainedModel):
898
897
  output_hidden_states: Optional[bool] = None,
899
898
  return_dict: Optional[bool] = None,
900
899
  cache_position: Optional[torch.LongTensor] = None,
900
+ **kwargs,
901
901
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
902
902
  r"""
903
903
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1041,12 +1041,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
1041
1041
  encoder_config = copy.deepcopy(config)
1042
1042
  encoder_config.is_decoder = False
1043
1043
  encoder_config.use_cache = False
1044
- encoder_config.tie_encoder_decoder = False
1045
1044
  self.encoder = MT5Stack(encoder_config)
1046
1045
 
1047
1046
  decoder_config = copy.deepcopy(config)
1048
1047
  decoder_config.is_decoder = True
1049
- decoder_config.tie_encoder_decoder = False
1050
1048
  decoder_config.num_layers = config.num_decoder_layers
1051
1049
  self.decoder = MT5Stack(decoder_config)
1052
1050
 
@@ -1064,7 +1062,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
1064
1062
  self.decoder.set_input_embeddings(new_embeddings)
1065
1063
 
1066
1064
  @auto_docstring
1067
- # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with google-t5/->google/, T5->MT5, t5->mt5
1068
1065
  def forward(
1069
1066
  self,
1070
1067
  input_ids: Optional[torch.LongTensor] = None,
@@ -1081,6 +1078,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
1081
1078
  output_hidden_states: Optional[bool] = None,
1082
1079
  return_dict: Optional[bool] = None,
1083
1080
  cache_position: Optional[torch.LongTensor] = None,
1081
+ **kwargs,
1084
1082
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
1085
1083
  r"""
1086
1084
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1181,9 +1179,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
1181
1179
 
1182
1180
  sequence_output = decoder_outputs[0]
1183
1181
 
1184
- if self.config.tie_word_embeddings:
1185
- sequence_output = sequence_output * (self.model_dim**-0.5)
1186
-
1187
1182
  lm_logits = self.lm_head(sequence_output)
1188
1183
 
1189
1184
  loss = None
@@ -1268,6 +1263,7 @@ class MT5EncoderModel(MT5PreTrainedModel):
1268
1263
  output_attentions: Optional[bool] = None,
1269
1264
  output_hidden_states: Optional[bool] = None,
1270
1265
  return_dict: Optional[bool] = None,
1266
+ **kwargs,
1271
1267
  ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
1272
1268
  r"""
1273
1269
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1340,6 +1336,7 @@ class MT5ForSequenceClassification(MT5PreTrainedModel):
1340
1336
  output_attentions: Optional[bool] = None,
1341
1337
  output_hidden_states: Optional[bool] = None,
1342
1338
  return_dict: Optional[bool] = None,
1339
+ **kwargs,
1343
1340
  ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
1344
1341
  r"""
1345
1342
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1480,6 +1477,7 @@ class MT5ForTokenClassification(MT5PreTrainedModel):
1480
1477
  output_attentions: Optional[bool] = None,
1481
1478
  output_hidden_states: Optional[bool] = None,
1482
1479
  return_dict: Optional[bool] = None,
1480
+ **kwargs,
1483
1481
  ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
1484
1482
  r"""
1485
1483
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1545,12 +1543,10 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
1545
1543
  encoder_config = copy.deepcopy(config)
1546
1544
  encoder_config.is_decoder = False
1547
1545
  encoder_config.use_cache = False
1548
- encoder_config.tie_encoder_decoder = False
1549
1546
  self.encoder = MT5Stack(encoder_config)
1550
1547
 
1551
1548
  decoder_config = copy.deepcopy(config)
1552
1549
  decoder_config.is_decoder = True
1553
- decoder_config.tie_encoder_decoder = False
1554
1550
  decoder_config.num_layers = config.num_decoder_layers
1555
1551
  self.decoder = MT5Stack(decoder_config)
1556
1552
 
@@ -1587,6 +1583,7 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
1587
1583
  output_attentions: Optional[bool] = None,
1588
1584
  output_hidden_states: Optional[bool] = None,
1589
1585
  return_dict: Optional[bool] = None,
1586
+ **kwargs,
1590
1587
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
1591
1588
  r"""
1592
1589
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -117,6 +117,7 @@ class MusicgenSinusoidalPositionalEmbedding(nn.Module):
117
117
  def __init__(self, num_positions: int, embedding_dim: int):
118
118
  super().__init__()
119
119
  self.embedding_dim = embedding_dim
120
+ self.num_positions = num_positions
120
121
  self.make_weights(num_positions, embedding_dim)
121
122
 
122
123
  def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -432,6 +433,9 @@ class MusicgenPreTrainedModel(PreTrainedModel):
432
433
  # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
433
434
  if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
434
435
  init.zeros_(module.weight[module.padding_idx])
436
+ elif isinstance(module, MusicgenSinusoidalPositionalEmbedding):
437
+ emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
438
+ init.copy_(module.weights, emb_weights)
435
439
 
436
440
 
437
441
  class MusicgenDecoder(MusicgenPreTrainedModel):
@@ -482,6 +486,7 @@ class MusicgenDecoder(MusicgenPreTrainedModel):
482
486
  output_hidden_states: Optional[bool] = None,
483
487
  return_dict: Optional[bool] = None,
484
488
  cache_position: Optional[torch.Tensor] = None,
489
+ **kwargs,
485
490
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
486
491
  r"""
487
492
  input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -716,6 +721,7 @@ class MusicgenModel(MusicgenPreTrainedModel):
716
721
  output_hidden_states: Optional[bool] = None,
717
722
  return_dict: Optional[bool] = None,
718
723
  cache_position: Optional[torch.Tensor] = None,
724
+ **kwargs,
719
725
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
720
726
  r"""
721
727
  input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -2080,7 +2086,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
2080
2086
  stopping_criteria: Optional[StoppingCriteriaList] = None,
2081
2087
  synced_gpus: Optional[bool] = None,
2082
2088
  streamer: Optional["BaseStreamer"] = None,
2083
- use_model_defaults: Optional[bool] = None,
2084
2089
  **kwargs,
2085
2090
  ):
2086
2091
  """
@@ -2125,11 +2130,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
2125
2130
  streamer (`BaseStreamer`, *optional*):
2126
2131
  Streamer object that will be used to stream the generated sequences. Generated tokens are passed
2127
2132
  through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
2128
- use_model_defaults (`bool`, *optional*):
2129
- When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
2130
- generation configuration (`model.generation_config`), as opposed to the global defaults
2131
- (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
2132
- `True`.
2133
2133
  kwargs (`dict[str, Any]`, *optional*):
2134
2134
  Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
2135
2135
  forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -2153,9 +2153,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
2153
2153
  """
2154
2154
  # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
2155
2155
  generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
2156
- generation_config, model_kwargs = self._prepare_generation_config(
2157
- generation_config, use_model_defaults, **kwargs
2158
- )
2156
+ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
2159
2157
  generation_mode = generation_config.get_generation_mode()
2160
2158
  if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
2161
2159
  raise ValueError(
@@ -122,6 +122,7 @@ class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
122
122
  def __init__(self, num_positions: int, embedding_dim: int):
123
123
  super().__init__()
124
124
  self.embedding_dim = embedding_dim
125
+ self.num_positions = num_positions
125
126
  self.make_weights(num_positions, embedding_dim)
126
127
 
127
128
  def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -403,6 +404,9 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
403
404
  # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
404
405
  if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
405
406
  init.zeros_(module.weight[module.padding_idx])
407
+ elif isinstance(module, MusicgenMelodySinusoidalPositionalEmbedding):
408
+ emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
409
+ init.copy_(module.weights, emb_weights)
406
410
 
407
411
 
408
412
  # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder with MUSICGEN->MUSICGEN_MELODY,Musicgen->MusicgenMelody
@@ -455,6 +459,7 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel):
455
459
  output_hidden_states: Optional[bool] = None,
456
460
  return_dict: Optional[bool] = None,
457
461
  cache_position: Optional[torch.Tensor] = None,
462
+ **kwargs,
458
463
  ) -> Union[tuple, BaseModelOutputWithPast]:
459
464
  r"""
460
465
  input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -670,6 +675,7 @@ class MusicgenMelodyModel(MusicgenMelodyPreTrainedModel):
670
675
  output_hidden_states: Optional[bool] = None,
671
676
  return_dict: Optional[bool] = None,
672
677
  cache_position: Optional[torch.Tensor] = None,
678
+ **kwargs,
673
679
  ) -> Union[tuple, BaseModelOutputWithPast]:
674
680
  r"""
675
681
  input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -785,6 +791,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
785
791
  return_dict: Optional[bool] = None,
786
792
  labels: Optional[torch.LongTensor] = None,
787
793
  cache_position: Optional[torch.Tensor] = None,
794
+ **kwargs,
788
795
  ) -> Union[tuple, MusicgenMelodyOutputWithPast]:
789
796
  r"""
790
797
  input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -21,6 +21,7 @@ import torch
21
21
  from torch import nn
22
22
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
23
23
 
24
+ from ... import initialization as init
24
25
  from ...activations import ACT2FN
25
26
  from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
26
27
  from ...generation import GenerationMixin
@@ -469,6 +470,11 @@ class MvpPreTrainedModel(PreTrainedModel):
469
470
  base_model_prefix = "model"
470
471
  supports_gradient_checkpointing = True
471
472
 
473
+ def _init_weights(self, module):
474
+ super()._init_weights(module)
475
+ if isinstance(module, MvpForConditionalGeneration):
476
+ init.zeros_(module.final_logits_bias)
477
+
472
478
  @property
473
479
  def dummy_inputs(self):
474
480
  pad_token = self.config.pad_token_id
@@ -534,6 +540,7 @@ class MvpEncoder(MvpPreTrainedModel):
534
540
  output_attentions: Optional[bool] = None,
535
541
  output_hidden_states: Optional[bool] = None,
536
542
  return_dict: Optional[bool] = None,
543
+ **kwargs,
537
544
  ) -> Union[tuple, BaseModelOutput]:
538
545
  r"""
539
546
  Args:
@@ -698,6 +705,7 @@ class MvpDecoder(MvpPreTrainedModel):
698
705
  output_hidden_states: Optional[bool] = None,
699
706
  return_dict: Optional[bool] = None,
700
707
  cache_position: Optional[torch.Tensor] = None,
708
+ **kwargs,
701
709
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
702
710
  r"""
703
711
  Args:
@@ -917,6 +925,7 @@ class MvpModel(MvpPreTrainedModel):
917
925
  output_hidden_states: Optional[bool] = None,
918
926
  return_dict: Optional[bool] = None,
919
927
  cache_position: Optional[torch.Tensor] = None,
928
+ **kwargs,
920
929
  ) -> Union[tuple, Seq2SeqModelOutput]:
921
930
  r"""
922
931
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1065,6 +1074,7 @@ class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin):
1065
1074
  output_hidden_states: Optional[bool] = None,
1066
1075
  return_dict: Optional[bool] = None,
1067
1076
  cache_position: Optional[torch.Tensor] = None,
1077
+ **kwargs,
1068
1078
  ) -> Union[tuple, Seq2SeqLMOutput]:
1069
1079
  r"""
1070
1080
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1213,6 +1223,7 @@ class MvpForSequenceClassification(MvpPreTrainedModel):
1213
1223
  output_attentions: Optional[bool] = None,
1214
1224
  output_hidden_states: Optional[bool] = None,
1215
1225
  return_dict: Optional[bool] = None,
1226
+ **kwargs,
1216
1227
  ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
1217
1228
  r"""
1218
1229
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1372,6 +1383,7 @@ class MvpForQuestionAnswering(MvpPreTrainedModel):
1372
1383
  output_attentions: Optional[bool] = None,
1373
1384
  output_hidden_states: Optional[bool] = None,
1374
1385
  return_dict: Optional[bool] = None,
1386
+ **kwargs,
1375
1387
  ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
1376
1388
  r"""
1377
1389
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1503,6 +1515,7 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
1503
1515
  def __init__(self, config):
1504
1516
  super().__init__(config)
1505
1517
  self.decoder = MvpDecoder(config)
1518
+ self.post_init()
1506
1519
 
1507
1520
  def forward(self, *args, **kwargs):
1508
1521
  return self.decoder(*args, **kwargs)
@@ -1548,6 +1561,7 @@ class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
1548
1561
  return_dict: Optional[bool] = None,
1549
1562
  cache_position: Optional[torch.Tensor] = None,
1550
1563
  logits_to_keep: Union[int, torch.Tensor] = 0,
1564
+ **kwargs,
1551
1565
  ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
1552
1566
  r"""
1553
1567
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -30,7 +30,7 @@ from ... import initialization as init
30
30
  from ...activations import ACT2FN
31
31
  from ...cache_utils import Cache, DynamicCache
32
32
  from ...generation import GenerationMixin
33
- from ...integrations import use_kernel_func_from_hub
33
+ from ...integrations import use_kernel_func_from_hub, use_kernelized_func
34
34
  from ...masking_utils import create_causal_mask
35
35
  from ...modeling_layers import GradientCheckpointingLayer
36
36
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
38
38
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
39
  from ...processing_utils import Unpack
40
40
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
41
- from ...utils.generic import check_model_inputs
41
+ from ...utils.generic import check_model_inputs, maybe_autocast
42
42
  from .configuration_nanochat import NanoChatConfig
43
43
 
44
44
 
@@ -74,7 +74,7 @@ class NanoChatRotaryEmbedding(nn.Module):
74
74
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
75
75
 
76
76
  self.register_buffer("inv_freq", inv_freq, persistent=False)
77
- self.original_inv_freq = inv_freq
77
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
78
78
 
79
79
  @staticmethod
80
80
  def compute_default_rope_parameters(
@@ -113,7 +113,7 @@ class NanoChatRotaryEmbedding(nn.Module):
113
113
  position_ids_expanded = position_ids[:, None, :].float()
114
114
 
115
115
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
116
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
116
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
117
117
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
118
118
  emb = torch.cat((freqs, freqs), dim=-1)
119
119
  cos = emb.cos() * self.attention_scaling
@@ -195,6 +195,7 @@ def rotate_half(x):
195
195
  return torch.cat((x2, -x1), dim=-1)
196
196
 
197
197
 
198
+ @use_kernelized_func(apply_rotary_pos_emb)
198
199
  class NanoChatAttention(nn.Module):
199
200
  """Multi-headed attention from 'Attention Is All You Need' paper"""
200
201
 
@@ -220,7 +221,6 @@ class NanoChatAttention(nn.Module):
220
221
  self.o_proj = nn.Linear(
221
222
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
222
223
  )
223
- self.rotary_fn = apply_rotary_pos_emb
224
224
 
225
225
  self.q_norm = NanoChatRMSNorm(eps=config.rms_norm_eps)
226
226
  self.k_norm = NanoChatRMSNorm(eps=config.rms_norm_eps)
@@ -45,6 +45,7 @@ from ...modeling_rope_utils import (
45
45
  )
46
46
  from ...modeling_utils import PreTrainedModel
47
47
  from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
48
+ from ...utils.generic import maybe_autocast
48
49
  from .configuration_nemotron import NemotronConfig
49
50
 
50
51
 
@@ -87,7 +88,7 @@ class NemotronLayerNorm1P(nn.LayerNorm):
87
88
  args = _cast_if_autocast_enabled(
88
89
  device_type, input, self.normalized_shape, self.weight + 1, self.bias, self.eps
89
90
  )
90
- with torch.autocast(device_type=input.device.type, enabled=False):
91
+ with maybe_autocast(device_type=input.device.type, enabled=False):
91
92
  return F.layer_norm(*args)
92
93
 
93
94
 
@@ -109,7 +110,7 @@ class NemotronRotaryEmbedding(nn.Module):
109
110
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
110
111
 
111
112
  self.register_buffer("inv_freq", inv_freq, persistent=False)
112
- self.original_inv_freq = inv_freq
113
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
113
114
 
114
115
  @staticmethod
115
116
  # Ignore copy
@@ -151,7 +152,7 @@ class NemotronRotaryEmbedding(nn.Module):
151
152
  position_ids_expanded = position_ids[:, None, :].float()
152
153
 
153
154
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
154
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
155
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
155
156
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
156
157
  emb = torch.cat((freqs, freqs), dim=-1)
157
158
  cos = emb.cos() * self.attention_scaling
@@ -396,8 +397,8 @@ class NemotronFlashAttention2(NemotronAttention):
396
397
  else torch.get_autocast_gpu_dtype()
397
398
  )
398
399
  # Handle the case where the model is quantized
399
- elif hasattr(self.config, "_pre_quantization_dtype"):
400
- target_dtype = self.config._pre_quantization_dtype
400
+ elif hasattr(self.config, "quantization_config"):
401
+ target_dtype = self.config.dtype
401
402
  else:
402
403
  target_dtype = self.q_proj.weight.dtype
403
404
 
@@ -657,6 +658,7 @@ class NemotronModel(NemotronPreTrainedModel):
657
658
  output_attentions: Optional[bool] = None,
658
659
  output_hidden_states: Optional[bool] = None,
659
660
  cache_position: Optional[torch.LongTensor] = None,
661
+ **kwargs,
660
662
  ) -> BaseModelOutputWithPast:
661
663
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
662
664
  output_hidden_states = (
@@ -13,7 +13,7 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- from typing import Optional
16
+ from typing import Optional, Union
17
17
 
18
18
  from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
19
19
  from tokenizers.models import BPE
@@ -83,13 +83,15 @@ class NllbTokenizer(TokenizersBackend):
83
83
 
84
84
  vocab_files_names = VOCAB_FILES_NAMES
85
85
  model_input_names = ["input_ids", "attention_mask"]
86
- slow_tokenizer_class = None
86
+ model = BPE
87
87
 
88
88
  prefix_tokens: list[int] = []
89
89
  suffix_tokens: list[int] = []
90
90
 
91
91
  def __init__(
92
92
  self,
93
+ vocab: Optional[Union[str, dict[str, int]]] = None,
94
+ merges: Optional[Union[str, list[str]]] = None,
93
95
  bos_token="<s>",
94
96
  eos_token="</s>",
95
97
  sep_token="</s>",
@@ -101,16 +103,11 @@ class NllbTokenizer(TokenizersBackend):
101
103
  tgt_lang=None,
102
104
  additional_special_tokens=None,
103
105
  legacy_behaviour=False,
104
- vocab=None,
105
- merges=None,
106
- vocab_file=None,
107
106
  **kwargs,
108
107
  ):
109
108
  if additional_special_tokens is None:
110
109
  additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)
111
110
 
112
- self.vocab_file = vocab_file
113
-
114
111
  mask_token = (
115
112
  AddedToken(mask_token, normalized=True, lstrip=True, special=True)
116
113
  if isinstance(mask_token, str)
@@ -118,23 +115,15 @@ class NllbTokenizer(TokenizersBackend):
118
115
  )
119
116
  self.legacy_behaviour = legacy_behaviour
120
117
 
121
- if vocab is not None:
122
- if isinstance(vocab, list):
123
- self._vocab = {token: idx for idx, (token, _score) in enumerate(vocab)}
124
- else:
125
- self._vocab = vocab
126
- else:
127
- self._vocab = {
118
+ if vocab is None:
119
+ vocab = {
128
120
  str(bos_token): 0,
129
121
  str(pad_token): 1,
130
122
  str(eos_token): 2,
131
123
  str(unk_token): 3,
132
124
  }
133
-
134
- if merges is None:
135
- self._merges = []
136
- else:
137
- self._merges = merges
125
+ self._vocab = vocab
126
+ self._merges = merges or []
138
127
 
139
128
  self._tokenizer = Tokenizer(
140
129
  BPE(
@@ -158,13 +147,10 @@ class NllbTokenizer(TokenizersBackend):
158
147
  self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
159
148
  self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
160
149
 
161
- tokenizer_object = self._tokenizer
162
-
163
150
  # Remove extra_special_tokens from kwargs if present to avoid conflict
164
151
  kwargs.pop("extra_special_tokens", None)
165
152
 
166
153
  super().__init__(
167
- tokenizer_object=tokenizer_object,
168
154
  bos_token=bos_token,
169
155
  eos_token=eos_token,
170
156
  sep_token=sep_token,
@@ -206,6 +206,7 @@ class NllbMoeConfig(PreTrainedConfig):
206
206
  self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
207
207
  self.moe_token_dropout = moe_token_dropout
208
208
  self.output_router_logits = output_router_logits
209
+
209
210
  super().__init__(
210
211
  pad_token_id=pad_token_id,
211
212
  bos_token_id=bos_token_id,