transformers 5.0.0rc1-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py

@@ -31,6 +31,7 @@ import torch.nn.functional as F
  from torch import nn
  from torch.nn import Parameter

+ from ... import initialization as init
  from ...activations import ACT2FN
  from ...cache_utils import Cache, DynamicCache
  from ...generation import GenerationMixin
@@ -62,6 +63,52 @@ from .configuration_qwen2_5_omni import (
  logger = logging.get_logger(__name__)


+ def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
+     """Generates a 1D Kaiser-windowed sinc filter.
+
+     Args:
+         cutoff (float): Normalized cutoff frequency (0 to 0.5).
+         half_width (float): Transition bandwidth.
+         kernel_size (int): Number of filter taps.
+
+     Returns:
+         torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
+     """
+     is_even = kernel_size % 2 == 0
+     half_size = kernel_size // 2
+
+     # Compute Kaiser window parameters
+     delta_f = 4 * half_width
+     attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+
+     if attenuation > 50.0:
+         beta = 0.1102 * (attenuation - 8.7)
+     elif attenuation >= 21.0:
+         beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
+     else:
+         beta = 0.0
+
+     kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
+
+     # Compute time indices
+     if is_even:
+         time_indices = torch.arange(-half_size, half_size) + 0.5
+     else:
+         time_indices = torch.arange(kernel_size) - half_size
+
+     # Compute sinc filter
+     if cutoff == 0:
+         return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
+
+     sinc_filter = torch.sinc(2 * cutoff * time_indices)
+     normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
+
+     # Normalize to ensure sum = 1 (avoid leakage of constant component)
+     normalized_filter /= normalized_filter.sum()
+
+     return normalized_filter.view(1, 1, kernel_size)
+
+
  @auto_docstring
  class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
      config: Qwen2_5OmniConfig
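For orientation while reading the rest of the diff: the helper above builds an anti-aliasing FIR kernel that the UpSample1d/DownSample1d modules further down store as module.filter (and that the new _init_weights hook below rebuilds). A minimal, hedged usage sketch, not taken from the package, with illustrative values:

import torch
import torch.nn.functional as F

# Hedged sketch: apply the (1, 1, kernel_size) low-pass kernel produced by
# kaiser_sinc_filter1d (from the hunk above) to a mono waveform with conv1d.
filt = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.1, kernel_size=12)
wave = torch.randn(1, 1, 480)                       # (batch, channels, time)
pad = (filt.shape[-1] - 1) // 2
smoothed = F.conv1d(F.pad(wave, (pad, pad)), filt)  # low-pass filtered signal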
@@ -75,6 +122,23 @@ class Qwen2_5OmniPreTrainedModel(Qwen2_5OmniPreTrainedModel):
      _can_compile_fullgraph = False
      _supports_attention_backend = True

+     def _init_weights(self, module):
+         super()._init_weights(module)
+         if isinstance(module, SinusoidsPositionEmbedding):
+             log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+             inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+             scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+             init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+         elif isinstance(module, UpSample1d):
+             filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+             init.copy_(module.filter, filter_tensor)
+         elif isinstance(module, DownSample1d):
+             filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+             init.copy_(module.filter, filter_tensor)
+         elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+             inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+             init.copy_(module.inv_freq, inv_freq)
+

  class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
      input_modalities = ("image", "video", "audio", "text")
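The _init_weights branches above use init.copy_ from the newly added transformers/initialization.py (file 22 in the list). Its implementation is not shown in this hunk; as a rough mental model it can be read as an in-place, gradient-free copy, roughly like this sketch (an assumption, not the actual code):

import torch

def copy_(tensor: torch.Tensor, value: torch.Tensor) -> None:
    # Assumed behaviour only: overwrite an existing parameter/buffer with a
    # precomputed value without recording the copy in autograd.
    with torch.no_grad():
        tensor.copy_(value.to(dtype=tensor.dtype, device=tensor.device))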
@@ -686,6 +750,9 @@ class Qwen2_5OmniAudioEncoderLayer(GradientCheckpointingLayer):
  class SinusoidsPositionEmbedding(nn.Module):
      def __init__(self, length, channels, max_timescale=10000):
          super().__init__()
+         self.length = length
+         self.channels = channels
+         self.max_timescale = max_timescale
          if channels % 2 != 0:
              raise ValueError("SinusoidsPositionEmbedding needs even channels input")
          log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
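Storing length, channels and max_timescale makes the sinusoid table reproducible from the module alone, which is what the new _init_weights hook relies on. The table itself can be recomputed standalone; a short sketch with illustrative sizes:

import numpy as np
import torch

length, channels, max_timescale = 8, 6, 10000  # illustrative values
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
table = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)  # shape (8, 6)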
@@ -1018,6 +1085,22 @@ class Qwen2_5OmniVisionBlock(GradientCheckpointingLayer):
          return hidden_states


+ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+     inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+     def __init__(self, dim: int, theta: float = 10000.0) -> None:
+         super().__init__()
+         self.dim = dim
+         self.theta = theta
+         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+     def forward(self, seqlen: int) -> torch.Tensor:
+         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+         freqs = torch.outer(seq, self.inv_freq)
+         return freqs
+
+
  class Qwen2_5_VisionPatchEmbed(nn.Module):
      def __init__(
          self,
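A quick shape check for the relocated Qwen2_5_VisionRotaryEmbedding (now defined earlier in the module, with dim and theta kept as attributes so _init_weights can recompute inv_freq); values are illustrative:

import torch

rope = Qwen2_5_VisionRotaryEmbedding(dim=8, theta=10000.0)  # class from the hunk above
freqs = rope(seqlen=4)
assert freqs.shape == (4, 4)  # one rotation angle per (position, frequency pair)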
@@ -1044,20 +1127,6 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
          return hidden_states


- class Qwen2_5_VisionRotaryEmbedding(nn.Module):
-     inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-     def __init__(self, dim: int, theta: float = 10000.0) -> None:
-         super().__init__()
-         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-     def forward(self, seqlen: int) -> torch.Tensor:
-         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-         freqs = torch.outer(seq, self.inv_freq)
-         return freqs
-
-
  class Qwen2_5OmniPatchMerger(nn.Module):
      def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
          super().__init__()
@@ -1105,6 +1174,8 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
          )
          self.gradient_checkpointing = False

+         self.post_init()
+
      def rot_pos_emb(self, grid_thw):
          pos_ids = []
          for t, h, w in grid_thw:
@@ -1252,7 +1323,7 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
          inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

          self.register_buffer("inv_freq", inv_freq, persistent=False)
-         self.original_inv_freq = inv_freq
+         self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

      @staticmethod
      def compute_default_rope_parameters(
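The switch from a plain attribute to register_buffer(..., persistent=False) means original_inv_freq now follows module.to(...) device and dtype moves while staying out of the checkpoint. A small standalone illustration of that PyTorch behaviour:

import torch
from torch import nn

m = nn.Module()
m.register_buffer("original_inv_freq", torch.ones(4), persistent=False)
assert "original_inv_freq" not in m.state_dict()  # not serialized
m.to(torch.float64)  # ...but the buffer is still cast/moved with the module
assert m.original_inv_freq.dtype == torch.float64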
@@ -2033,6 +2104,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
          feature_attention_mask=None,
          use_audio_in_video=False,
          video_second_per_grid=None,
+         is_first_iteration=False,
          **kwargs,
      ):
          model_inputs = super().prepare_inputs_for_generation(
@@ -2051,12 +2123,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
              feature_attention_mask=feature_attention_mask,
              use_audio_in_video=use_audio_in_video,
              video_second_per_grid=video_second_per_grid,
+             is_first_iteration=is_first_iteration,
              **kwargs,
          )

          model_inputs["position_ids"] = None

-         if cache_position[0] != 0:
+         if not is_first_iteration and use_cache:
              model_inputs["pixel_values"] = None
              model_inputs["pixel_values_videos"] = None
              model_inputs["input_features"] = None
@@ -2386,7 +2459,11 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
                  self.rope_deltas = rope_deltas

              else:
-                 batch_size, seq_length, _ = inputs_embeds.shape
+                 if inputs_embeds is not None:
+                     batch_size, seq_length, _ = inputs_embeds.shape
+                 else:
+                     batch_size, seq_length = input_ids.shape
+
                  delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                  position_ids = torch.arange(seq_length, device=input_ids.device)
                  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
@@ -2521,7 +2598,7 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
          inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

          self.register_buffer("inv_freq", inv_freq, persistent=False)
-         self.original_inv_freq = inv_freq
+         self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

      @staticmethod
      def compute_default_rope_parameters(
@@ -3184,52 +3261,6 @@ class SnakeBeta(nn.Module):
          return hidden_states


- def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
-     """Generates a 1D Kaiser-windowed sinc filter.
-
-     Args:
-         cutoff (float): Normalized cutoff frequency (0 to 0.5).
-         half_width (float): Transition bandwidth.
-         kernel_size (int): Number of filter taps.
-
-     Returns:
-         torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
-     """
-     is_even = kernel_size % 2 == 0
-     half_size = kernel_size // 2
-
-     # Compute Kaiser window parameters
-     delta_f = 4 * half_width
-     attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-
-     if attenuation > 50.0:
-         beta = 0.1102 * (attenuation - 8.7)
-     elif attenuation >= 21.0:
-         beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
-     else:
-         beta = 0.0
-
-     kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
-
-     # Compute time indices
-     if is_even:
-         time_indices = torch.arange(-half_size, half_size) + 0.5
-     else:
-         time_indices = torch.arange(kernel_size) - half_size
-
-     # Compute sinc filter
-     if cutoff == 0:
-         return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
-
-     sinc_filter = torch.sinc(2 * cutoff * time_indices)
-     normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
-
-     # Normalize to ensure sum = 1 (avoid leakage of constant component)
-     normalized_filter /= normalized_filter.sum()
-
-     return normalized_filter.view(1, 1, kernel_size)
-
-
  class UpSample1d(nn.Module):
      def __init__(self, ratio=2, kernel_size=None):
          super().__init__()
@@ -3260,6 +3291,9 @@ class DownSample1d(nn.Module):
          super().__init__()
          cutoff = 0.5 / ratio
          half_width = 0.6 / ratio
+         self.cutoff = cutoff
+         self.half_width = half_width
+         self.kernel_size = kernel_size

          if cutoff < 0.0:
              raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3441,6 +3475,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
              config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
          )

+         self.post_init()
+
      def normalize_spectrogram(self, spectrogram, max_value, min_db):
          return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)

@@ -3568,6 +3604,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
          self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size)  # final modulation
          self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)

+         self.post_init()
+
      def _create_block_diff(self, hidden_states):
          batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
          block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
@@ -3720,6 +3758,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
              config.bigvgan_config, attn_implementation=attn_impl
          )

+         self.post_init()
+
      def forward(
          self,
          code,
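Several Qwen2_5Omni submodels above now end their __init__ with self.post_init(), the standard PreTrainedModel hook that, among other things, runs weight initialization and therefore the _init_weights logic added earlier in this diff. A hedged sketch of the pattern (MySubModel is a hypothetical class for illustration only):

from torch import nn

class MySubModel(Qwen2_5OmniPreTrainedModel):  # hypothetical subclass for illustration
    def __init__(self, config):
        super().__init__(config)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.post_init()  # runs weight init / final setup at the end of __init__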
transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py

@@ -26,27 +26,13 @@ import torch.nn.functional as F
  from torch import nn
  from torch.nn import Parameter

- from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, rotate_half
- from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
- from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
-     Qwen2_5_VisionTransformerPretrainedModel,
-     Qwen2_5_VLAttention,
-     Qwen2_5_VLMLP,
-     Qwen2_5_VLPreTrainedModel,
-     Qwen2_5_VLTextModel,
-     Qwen2_5_VLVisionBlock,
-     eager_attention_forward,
- )
- from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
- from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
- from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
-
+ from ... import initialization as init
  from ...cache_utils import Cache
  from ...configuration_utils import PreTrainedConfig, layer_type_validation
  from ...generation import GenerationMixin
  from ...modeling_outputs import BaseModelOutput, ModelOutput
  from ...modeling_rope_utils import RopeParameters
- from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...utils import (
      TransformersKwargs,
@@ -56,6 +42,21 @@ from ...utils import (
  )
  from ...utils.deprecation import deprecate_kwarg
  from ...utils.hub import cached_file
+ from ..llama.modeling_llama import LlamaRotaryEmbedding, rotate_half
+ from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+ from ..qwen2_5_vl.modeling_qwen2_5_vl import (
+     Qwen2_5_VisionRotaryEmbedding,
+     Qwen2_5_VisionTransformerPretrainedModel,
+     Qwen2_5_VLAttention,
+     Qwen2_5_VLMLP,
+     Qwen2_5_VLPreTrainedModel,
+     Qwen2_5_VLTextModel,
+     Qwen2_5_VLVisionBlock,
+     eager_attention_forward,
+ )
+ from ..qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
+ from ..qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
+ from ..qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding


  logger = logging.get_logger(__name__)
@@ -1054,6 +1055,23 @@ class Qwen2_5OmniPreTrainedModel(Qwen2_5_VLPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
     _can_compile_fullgraph = False
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, UpSample1d):
+            filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, DownSample1d):
+            filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
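The hunk above moves deterministic buffer initialization into `_init_weights`, which is why the affected modules now keep their constructor arguments around. As a worked illustration, here is the sinusoidal table that gets recomputed for `SinusoidsPositionEmbedding`, extracted into a standalone snippet with toy sizes (the variable names follow the diff; the sizes are illustrative only):

```python
import numpy as np
import torch

# Recompute the sin/cos positional table exactly as the _init_weights branch does.
length, channels, max_timescale = 8, 4, 10000
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
positional_embedding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
print(positional_embedding.shape)  # torch.Size([8, 4]) -- (length, channels)
```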
@@ -1610,6 +1628,9 @@ class Qwen2_5OmniAudioEncoderLayer(Qwen2AudioEncoderLayer):
 class SinusoidsPositionEmbedding(nn.Module):
     def __init__(self, length, channels, max_timescale=10000):
         super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
         if channels % 2 != 0:
             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
@@ -1918,6 +1939,10 @@ class Qwen2_5OmniVisionBlock(Qwen2_5_VLVisionBlock):
         return hidden_states
 
 
+class Qwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
+    pass
+
+
 class Qwen2_5OmniVisionEncoder(Qwen2_5_VisionTransformerPretrainedModel):
     config: Qwen2_5OmniVisionEncoderConfig
     input_modalities = ("image", "video")
@@ -2382,6 +2407,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -2400,12 +2426,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         model_inputs["position_ids"] = None
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None
@@ -2588,7 +2615,11 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
             self.rope_deltas = rope_deltas
 
         else:
-            batch_size, seq_length, _ = inputs_embeds.shape
+            if inputs_embeds is not None:
+                batch_size, seq_length, _ = inputs_embeds.shape
+            else:
+                batch_size, seq_length = input_ids.shape
+
             delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
             position_ids = torch.arange(seq_length, device=input_ids.device)
             position_ids = position_ids.view(1, -1).expand(batch_size, -1)
@@ -3419,6 +3450,9 @@ class DownSample1d(nn.Module):
         super().__init__()
         cutoff = 0.5 / ratio
         half_width = 0.6 / ratio
+        self.cutoff = cutoff
+        self.half_width = half_width
+        self.kernel_size = kernel_size
 
         if cutoff < 0.0:
             raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3600,6 +3634,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
             config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
         )
 
+        self.post_init()
+
     def normalize_spectrogram(self, spectrogram, max_value, min_db):
         return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)
 
@@ -3727,6 +3763,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size)  # final modulation
         self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
 
+        self.post_init()
+
     def _create_block_diff(self, hidden_states):
         batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
         block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
@@ -3879,6 +3917,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
             config.bigvgan_config, attn_implementation=attn_impl
         )
 
+        self.post_init()
+
     def forward(
         self,
         code,
@@ -32,6 +32,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -96,6 +97,8 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -217,8 +220,8 @@ class Qwen2_5_VLVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention 2: Use cu_seqlens for variable length attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
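The last hunk above is a small generalization: any attention implementation whose name contains "flash" now takes the `cu_seqlens` variable-length path, not just the literal "flash_attention_2". A trivial illustration with assumed backend names (the actual set of registered implementations may differ):

```python
# Which implementation names would take the cu_seqlens path under the new check.
for impl in ("eager", "sdpa", "flash_attention_2", "flash_attention_3"):
    print(impl, "flash" in impl)
# eager False / sdpa False / flash_attention_2 True / flash_attention_3 True
```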
@@ -304,6 +307,12 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
     config: Qwen2_5_VLVisionConfig
@@ -336,6 +345,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -508,7 +519,7 @@ class Qwen2_5_VLRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
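The `original_inv_freq` change above turns a plain attribute into a non-persistent buffer, so the saved copy of the inverse frequencies follows `.to(device)` / dtype casts with the module yet stays out of the checkpoint. A minimal sketch of that behaviour with a toy module (names are illustrative, not the library class):

```python
import torch
from torch import nn

class RotaryBuffers(nn.Module):
    def __init__(self, dim: int = 8, theta: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        # Non-persistent buffers move with the module but are excluded from state_dict().
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = RotaryBuffers()
print(list(m.state_dict().keys()))  # [] -- nothing is serialized for these buffers
```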
@@ -1525,6 +1536,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1542,6 +1554,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -1551,7 +1564,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -1574,7 +1587,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -26,8 +26,20 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
-from transformers.models.qwen2_vl.modeling_qwen2_vl import (
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PreTrainedConfig
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...video_utils import VideoInput
+from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
+from ..qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     PatchMerger,
     Qwen2RMSNorm,
@@ -40,23 +52,7 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     VisionAttention,
     VisionRotaryEmbedding,
 )
-from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache
-from ...configuration_utils import PreTrainedConfig
-from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput
-from ...modeling_flash_attention_utils import is_flash_attn_available
-from ...modeling_layers import GradientCheckpointingLayer
-from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
-from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import logging
-from ...video_utils import VideoInput
-
-
-if is_flash_attn_available():
-    pass
+from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
 
 
 logger = logging.get_logger(__name__)
@@ -173,7 +169,11 @@ class Qwen2_5_VLVisionBlock(GradientCheckpointingLayer):
 
 
 class Qwen2_5_VLPreTrainedModel(Qwen2VLPreTrainedModel):
-    pass
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
 
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
@@ -207,6 +207,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
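Both `_init_weights` overrides above recompute the vision rotary embedding's inverse frequencies instead of relying on values created once in `__init__`, which is also why `dim` and `theta` are now stored on the module. The formula in isolation, with toy values:

```python
import torch

# Standard RoPE inverse-frequency table, as recomputed in the _init_weights overrides above.
dim, theta = 8, 10000.0
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
print(inv_freq)  # tensor([1.0000, 0.1000, 0.0100, 0.0010]) -- one frequency per pair of dims
```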
@@ -776,6 +778,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -793,6 +796,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -802,7 +806,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -825,7 +829,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -848,11 +848,11 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
         # Overwritten -- we should not pass input_features when we are in cached decoding stage
 
         input_features = kwargs.pop("input_features", None)
-        cache_position = kwargs.get("cache_position")
+        is_first_iteration = kwargs.get("is_first_iteration", False)
 
         model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
 
-        if cache_position is not None and cache_position[0] == 0:
+        if is_first_iteration or not kwargs.get("use_cache", True):
            # input_features should only be passed when we are not in cached decoding stage
            model_inputs["input_features"] = input_features