transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/mobilevit/image_processing_mobilevit.py

@@ -79,7 +79,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
             Controls the size of the output image after resizing. Can be overridden by the `size` parameter in the
             `preprocess` method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter
             in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):

@@ -112,7 +112,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         self,
         do_resize: bool = True,
         size: Optional[dict[str, int]] = None,
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
         rescale_factor: Union[int, float] = 1 / 255,
         do_center_crop: bool = True,

@@ -137,12 +137,12 @@ class MobileViTImageProcessor(BaseImageProcessor):
         self.do_flip_channel_order = do_flip_channel_order
         self.do_reduce_labels = do_reduce_labels
 
-    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize with PILImageResampling.BICUBIC->PILImageResampling.BILINEAR
+    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
         size: dict[str, int],
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,

@@ -156,7 +156,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
                 Image to resize.
             size (`dict[str, int]`):
                 Size of the output image.
-            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                 Resampling filter to use when resiizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.

transformers/models/mobilevit/image_processing_mobilevit_fast.py

@@ -42,7 +42,7 @@ from .image_processing_mobilevit import MobileVitImageProcessorKwargs
 
 @auto_docstring
 class MobileViTImageProcessorFast(BaseImageProcessorFast):
-    resample = PILImageResampling.BILINEAR
+    resample = PILImageResampling.BICUBIC
     size = {"shortest_edge": 224}
     default_to_square = False
     crop_size = {"height": 256, "width": 256}
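
The five hunks above flip MobileViT's default resampling filter from BILINEAR to BICUBIC, which silently changes pixel values for callers who rely on the defaults. A minimal sketch, assuming only the `resample` kwarg visible in the signatures above (`apple/mobilevit-small` is just an example checkpoint), of pinning the old behavior explicitly:

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor
from transformers.image_utils import PILImageResampling

image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))  # stand-in input
processor = AutoImageProcessor.from_pretrained("apple/mobilevit-small")

# rc2 defaults to BICUBIC; passing resample per call reproduces rc1's output.
batch = processor(images=image, resample=PILImageResampling.BILINEAR, return_tensors="pt")
```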

@@ -182,7 +182,6 @@ class MobileViTImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
         # Stack all processed images if return_tensors is specified
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
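
The explicit `torch.stack` is dropped here. Given the companion `feature_extraction_utils.py` change in the file list (+54/-22), the list-to-tensor conversion presumably now happens inside `BatchFeature` when `tensor_type` is set; a sketch of the resulting call pattern, under that assumption:

```python
import torch
from transformers.feature_extraction_utils import BatchFeature

images = [torch.zeros(3, 256, 256), torch.zeros(3, 256, 256)]
# Assumption: with tensor_type="pt", rc2's BatchFeature converts the list of
# per-image tensors itself, so processors no longer pre-stack before returning.
batch = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
```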

transformers/models/mobilevit/modeling_mobilevit.py

@@ -615,6 +615,10 @@ class MobileViTPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)

transformers/models/mobilevitv2/modeling_mobilevitv2.py

@@ -582,6 +582,10 @@ class MobileViTV2PreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.GroupNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
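
Both hunks extend `_init_weights` to the running statistics that batch-norm layers keep as buffers rather than parameters. A standalone PyTorch sketch of the three buffers being reset, and of why the `getattr` guard suffices:

```python
import torch
from torch import nn

bn = nn.BatchNorm2d(num_features=8)
print(bn.running_mean)         # per-channel zeros
print(bn.running_var)          # per-channel ones
print(bn.num_batches_tracked)  # tensor(0)

# Only norm layers with track_running_stats=True expose these buffers, which
# is exactly what the `getattr(module, "running_mean", None)` guard checks:
gn = nn.GroupNorm(num_groups=2, num_channels=8)
print(getattr(gn, "running_mean", None))  # None -> the re-init branch is skipped
```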

transformers/models/modernbert/modeling_modernbert.py

@@ -268,7 +268,7 @@ class ModernBertRotaryEmbedding(nn.Module):
             rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
             curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type)
             self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq)
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
             setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)
 
     @staticmethod
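
Storing `original_inv_freq` via `register_buffer` rather than `setattr` means the copy follows the module through `.to()` and `.cuda()` and shows up in `named_buffers()`, while `persistent=False` keeps it out of checkpoints; the `.clone()` avoids aliasing the live `inv_freq` buffer. A small plain-PyTorch illustration:

```python
import torch
from torch import nn

class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Buffer: tracked by the module and moved by .to(); clone() so later
        # in-place edits to inv_freq don't silently change the saved original.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
        self.plain_attr = inv_freq  # a plain attribute would stay behind on .to()

m = Rope()
print("original_inv_freq" in m.state_dict())    # False: persistent=False
print([name for name, _ in m.named_buffers()])  # ['inv_freq', 'original_inv_freq']
```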

@@ -677,6 +677,17 @@ class ModernBertPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+        elif isinstance(module, ModernBertUnpaddedRotaryEmbedding):
+            inv_freq = module._compute_inv_freq()
+            init.copy_(module.inv_freq, inv_freq)
 
     def _check_and_adjust_attn_implementation(
         self, attn_implementation: Optional[str], is_init_check: bool = False
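
These new `_init_weights` branches recompute the RoPE tables during weight initialization. The likely motivation: non-persistent buffers never land in checkpoints, so any loading path that skips the `__init__`-time math (for example, materializing the model on the meta device first) leaves them unset and needs an explicit re-init hook. A generic sketch of the failure mode, with hypothetical names:

```python
import torch
from torch import nn

class WithTable(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("inv_freq", torch.arange(4.0), persistent=False)

print("inv_freq" in WithTable().state_dict())  # False: never saved to checkpoints

# A module built on the meta device carries no real buffer data, and loading
# a checkpoint cannot restore what was never saved:
with torch.device("meta"):
    m = WithTable()
m.to_empty(device="cpu")             # buffers allocated but uninitialized
m.inv_freq.copy_(torch.arange(4.0))  # the recompute-and-copy the hunks perform
```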

transformers/models/modernbert/modular_modernbert.py

@@ -35,7 +35,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_rope_utils import RopeParameters
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, is_flash_attn_2_available, logging
 from ...utils.import_utils import is_triton_available

@@ -871,6 +871,17 @@ class ModernBertPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+        elif isinstance(module, ModernBertUnpaddedRotaryEmbedding):
+            inv_freq = module._compute_inv_freq()
+            init.copy_(module.inv_freq, inv_freq)
 
     def _check_and_adjust_attn_implementation(
         self, attn_implementation: Optional[str], is_init_check: bool = False

transformers/models/modernbert_decoder/modeling_modernbert_decoder.py

@@ -119,7 +119,7 @@ class ModernBertDecoderRotaryEmbedding(nn.Module):
             rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
             curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type)
             self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq)
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
             setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)
 
     @staticmethod

@@ -443,6 +443,14 @@ class ModernBertDecoderPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertDecoderRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
 
 
 @auto_docstring

transformers/models/modernbert_decoder/modular_modernbert_decoder.py

@@ -28,7 +28,7 @@ from ...generation import GenerationMixin
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
-from ...modeling_rope_utils import RopeParameters
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging

@@ -482,6 +482,14 @@ class ModernBertDecoderPreTrainedModel(ModernBertPreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertDecoderRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
 
     def _check_and_adjust_attn_implementation(self, attn_implementation, is_init_check):
         raise AttributeError("No need to inherit!")

transformers/models/moonshine/modeling_moonshine.py

@@ -98,7 +98,7 @@ class MoonshineRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

transformers/models/moshi/modeling_moshi.py

@@ -289,7 +289,7 @@ class MoshiRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

@@ -609,8 +609,8 @@ class MoshiFlashAttention2(MoshiAttention):
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(self.config, "_pre_quantization_dtype"):
-            target_dtype = self.config._pre_quantization_dtype
+        elif hasattr(self.config, "quantization_config"):
+            target_dtype = self.config.dtype
         else:
             target_dtype = self.q_proj.weight.dtype
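
The old branch keyed off the private `_pre_quantization_dtype` attribute; rc2 keys off the public `quantization_config` and reads the config-level `dtype` instead. Flash-attention kernels only accept fp16/bf16, so hidden states that were upcast (e.g. by an fp32 layernorm, or because quantized weights carry integer dtypes) must be cast back before the kernel call. A simplified, hedged sketch of the fallback chain shown in the hunk above, not the verbatim transformers implementation:

```python
import torch

def resolve_flash_dtype(config, fallback_weight: torch.Tensor) -> torch.dtype:
    # Sketch of the selection order in the diff above (assumption-laden).
    if torch.is_autocast_enabled():
        return torch.get_autocast_gpu_dtype()
    if hasattr(config, "quantization_config"):  # quantized: weight dtype is unusable
        return config.dtype                     # rc2: the config's compute dtype
    return fallback_weight.dtype                # plain model: follow q_proj's weights
```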

@@ -869,6 +869,8 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin):
         self.gradient_checkpointing = False
         self.config = config
 
+        self.post_init()
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
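
Appending `self.post_init()` at the end of `__init__` is the standard `PreTrainedModel` convention: it runs weight initialization and any final hooks over the fully built module tree, and `MoshiDepthDecoder` was previously missing it. A minimal sketch of the convention with a hypothetical config/model pair:

```python
from torch import nn
from transformers import PretrainedConfig, PreTrainedModel

class TinyConfig(PretrainedConfig):
    model_type = "tiny-sketch"  # hypothetical, for illustration only

    def __init__(self, initializer_range=0.02, **kwargs):
        super().__init__(**kwargs)
        self.initializer_range = initializer_range

class TinyModel(PreTrainedModel):
    config_class = TinyConfig

    def __init__(self, config):
        super().__init__(config)
        self.proj = nn.Linear(4, 4)
        # Convention shown in the hunk: call post_init() last so weight init
        # (and weight tying, where applicable) sees the complete module tree.
        self.post_init()

model = TinyModel(TinyConfig())
```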

@@ -2178,6 +2180,7 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
         user_delay_pattern_mask=None,
         moshi_delay_pattern_mask=None,
         kwargs_depth_decoder=None,
+        is_first_iteration=False,
         blank_user_audio_codes: Optional[torch.FloatTensor] = None,
         **kwargs,
     ):
@@ -2189,49 +2192,21 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
         # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
         # (we can't check exception 3 while compiling)

-        if past_key_values is not None:
-            if (
-                inputs_embeds is not None  # Exception 1
-                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
-            ):
-                input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
-                input_ids = input_ids[:, cache_position]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and cache_position[0] == 0:
-            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
-        else:
-            model_inputs = {"input_ids": input_ids, "inputs_embeds": None}
-
-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if model_inputs["inputs_embeds"] is not None:
-                batch_size, sequence_length, _ = inputs_embeds.shape
-                device = inputs_embeds.device
-            else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
-
-            attention_mask = self.decoder.model._prepare_4d_causal_attention_mask_with_cache_position(
-                attention_mask,
-                sequence_length=sequence_length,
-                target_length=past_key_values.get_max_cache_shape(),
-                dtype=self.decoder.lm_head.weight.dtype,
-                device=device,
-                cache_position=cache_position,
-                batch_size=batch_size,
-                config=self.config,
-                past_key_values=past_key_values,
-            )
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "cache_position": cache_position,
-            }
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            user_delay_pattern_mask=user_delay_pattern_mask,
+            moshi_delay_pattern_mask=moshi_delay_pattern_mask,
+            kwargs_depth_decoder=kwargs_depth_decoder,
+            is_first_iteration=is_first_iteration,
+            blank_user_audio_codes=blank_user_audio_codes,
+            **kwargs,
         )

         # 2. Now that everything is prepared, generate audio_codes using the depth decoder
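
This rewrite drops the hand-rolled cache slicing and static-cache mask construction in favor of the generic `GenerationMixin.prepare_inputs_for_generation`, which also forwards unconsumed kwargs on its own (hence the deletion of the manual forwarding loop in the next hunk). What remains of the override only injects model-specific entries after delegating; the shape of that pattern in a runnable toy (class names invented for the sketch):

```python
class BaseGenerator:
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # stands in for the generic handling: cache slicing, masks, kwarg forwarding
        return {"input_ids": input_ids, **kwargs}

class SpeechGenerator(BaseGenerator):
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs)
        # model-specific work (e.g. depth-decoder inputs) happens after delegation
        model_inputs["is_first_iteration"] = kwargs.get("past_key_values") is None
        return model_inputs

print(SpeechGenerator().prepare_inputs_for_generation([1, 2, 3], use_cache=True))
# {'input_ids': [1, 2, 3], 'use_cache': True, 'is_first_iteration': True}
```
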
@@ -2270,11 +2245,6 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
             model_inputs["input_ids"] = None
             model_inputs["inputs_embeds"] = inputs_embeds

-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
-
         return model_inputs

     def _update_model_kwargs_for_generation(
@@ -52,6 +52,8 @@ class MPNetPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MPNetLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, MPNetEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 class MPNetEmbeddings(nn.Module):
@@ -54,7 +54,7 @@ def load_cuda_kernels():
     global mra_cuda_kernel
     if not is_kernels_available():
         raise ImportError("kernels is not installed, please install it with `pip install kernels`")
-    from kernels import get_kernel
+    from ...integrations.hub_kernels import get_kernel

     mra_cuda_kernel = get_kernel("kernels-community/mra")

@@ -796,6 +796,9 @@ class MraPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MraLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, MraEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
+            init.zeros_(module.token_type_ids)


 @auto_docstring
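
Both the MPNet and Mra branches above materialize the `position_ids` buffer explicitly at init time; Mra additionally shifts every position by 2, matching the offset its checkpoints were trained with. The resulting buffers, concretely:

```python
import torch

max_positions = 8
mpnet_style = torch.arange(max_positions).expand((1, -1))
mra_style = torch.arange(max_positions).expand((1, -1)) + 2
print(mpnet_style)  # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
print(mra_style)    # tensor([[2, 3, 4, 5, 6, 7, 8, 9]])
```
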
@@ -133,17 +133,16 @@ class MT5Config(PreTrainedConfig):
         if feed_forward_proj == "gated-gelu":
             self.dense_act_fn = "gelu_new"

+        # Force because official weights have False serialized, but we have to tie always
+        kwargs["tie_word_embeddings"] = True
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
             tokenizer_class=tokenizer_class,
-            tie_word_embeddings=tie_word_embeddings,
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,
             decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )
-        # TODO: Mt5 never supported not tying encoder decoder so this has to be true.
-        self.tie_encoder_decoder = True


 __all__ = ["MT5Config"]
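
Rather than forcing `tie_encoder_decoder` after construction, the config now overrides `tie_word_embeddings` in `kwargs` before the parent constructor consumes it, so a checkpoint that serialized `False` can no longer yield an untied model. The mechanism in a toy config (not `MT5Config` itself):

```python
class ToyBaseConfig:
    def __init__(self, **kwargs):
        self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)

class ToyConfig(ToyBaseConfig):
    def __init__(self, **kwargs):
        # the serialized value is overridden before the parent reads it
        kwargs["tie_word_embeddings"] = True
        super().__init__(**kwargs)

assert ToyConfig(tie_word_embeddings=False).tie_word_embeddings is True
```
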
@@ -860,12 +860,10 @@ class MT5Model(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -1043,12 +1041,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -1066,7 +1062,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         self.decoder.set_input_embeddings(new_embeddings)

     @auto_docstring
-    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with google-t5/->google/, T5->MT5, t5->mt5
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1184,9 +1179,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):

         sequence_output = decoder_outputs[0]

-        if self.config.tie_word_embeddings:
-            sequence_output = sequence_output * (self.model_dim**-0.5)
-
         lm_logits = self.lm_head(sequence_output)

         loss = None
@@ -1551,12 +1543,10 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -117,6 +117,7 @@ class MusicgenSinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)

     def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -432,6 +433,9 @@ class MusicgenPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)


 class MusicgenDecoder(MusicgenPreTrainedModel):
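
Storing `num_positions` lets `_init_weights` rebuild the sinusoidal table on demand, a pattern repeated below for MusicgenMelody and NLLB-MoE. For orientation, a standard sinusoidal table of the kind a `get_embedding` helper produces, in the usual Vaswani et al. parameterization (the exact transformers layout may order sin/cos differently):

```python
import math
import torch

def sinusoidal_table(num_positions: int, dim: int) -> torch.Tensor:
    half = dim // 2
    # geometric frequency ladder from 1 down to 1/10000
    freqs = torch.exp(torch.arange(half, dtype=torch.float32) * -(math.log(10000.0) / (half - 1)))
    angles = torch.arange(num_positions, dtype=torch.float32)[:, None] * freqs[None, :]
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)

print(sinusoidal_table(16, 8).shape)  # torch.Size([16, 8])
```
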
@@ -2082,7 +2086,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin):
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         synced_gpus: Optional[bool] = None,
         streamer: Optional["BaseStreamer"] = None,
-        use_model_defaults: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -2127,11 +2130,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin):
             streamer (`BaseStreamer`, *optional*):
                 Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                 through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
-            use_model_defaults (`bool`, *optional*):
-                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
-                generation configuration (`model.generation_config`), as opposed to the global defaults
-                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
-                `True`.
             kwargs (`dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -2155,9 +2153,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin):
         """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
         generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
             raise ValueError(
@@ -122,6 +122,7 @@ class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)

     def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -403,6 +404,9 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenMelodySinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)


 # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder with MUSICGEN->MUSICGEN_MELODY,Musicgen->MusicgenMelody
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -469,6 +470,11 @@ class MvpPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, MvpForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -1509,6 +1515,7 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = MvpDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -74,7 +74,7 @@ class NanoChatRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -110,7 +110,7 @@ class NemotronRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     # Ignore copy
@@ -397,8 +397,8 @@ class NemotronFlashAttention2(NemotronAttention):
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype

@@ -206,6 +206,7 @@ class NllbMoeConfig(PreTrainedConfig):
         self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
         self.moe_token_dropout = moe_token_dropout
         self.output_router_logits = output_router_logits
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -21,6 +21,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -66,6 +67,7 @@ class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -665,6 +667,14 @@ class NllbMoePreTrainedModel(PreTrainedModel):
     _supports_sdpa = False
     _supports_flex_attn = False

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, NllbMoeSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+

 class NllbMoeEncoder(NllbMoePreTrainedModel):
     _can_record_outputs = {
@@ -290,7 +290,6 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

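
The explicit `torch.stack` became redundant because `BatchFeature` is now expected to convert a list of equal-shaped tensors itself when `tensor_type` is set (note the `feature_extraction_utils.py` changes elsewhere in this diff). Assuming that behavior, the result is the same batched tensor the deleted line built by hand:

```python
import torch

processed_images = [torch.zeros(3, 4, 4) for _ in range(2)]
# What the deleted line did explicitly, and what the tensor_type-aware
# BatchFeature conversion is assumed to do internally:
pixel_values = torch.stack(processed_images, dim=0)
print(pixel_values.shape)  # torch.Size([2, 3, 4, 4])
```
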
@@ -441,31 +441,26 @@ class NougatTokenizer(TokenizersBackend):
         )
         self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

-        # Set up post processor with bos and eos tokens
-        bos_token_id = self._vocab.get(str(bos_token), 0)
-        eos_token_id = self._vocab.get(str(eos_token), 2)
-        pad_token_id = self._vocab.get(str(pad_token), 1)
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
         self._tokenizer.post_processor = processors.TemplateProcessing(
             single=f"{bos_token}:0 $A:0 {eos_token}:0",
             pair="$A:0 $B:1",
             special_tokens=[
-                (str(eos_token), eos_token_id),
-                (str(bos_token), bos_token_id),
+                (str(eos_token), self.eos_token_id),
+                (str(bos_token), self.bos_token_id),
             ],
         )

         # Enable truncation and padding
         self._tokenizer.enable_truncation(max_length=4096)
-        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
-
-        super().__init__(
-            errors=errors,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
+        self._tokenizer.enable_padding(length=4096, pad_id=self.pad_token_id, pad_token=str(pad_token))

     def remove_hallucinated_references(self, text: str) -> str:
         """