transformers 5.0.0__py3-none-any.whl → 5.0.0rc0__py3-none-any.whl

This diff represents the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
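For readers who want to verify a listing like this themselves, a minimal sketch follows. It is not the registry's own tooling: it only compares the file lists of the two wheels (not the per-file line counts shown below), and the local wheel filenames are assumptions about what you have downloaded.

# Minimal sketch (assumed filenames): compare the file lists of two wheels.
# A .whl is a zip archive, so zipfile can enumerate its contents directly.
import zipfile

OLD = "transformers-5.0.0-py3-none-any.whl"     # assumed local path
NEW = "transformers-5.0.0rc0-py3-none-any.whl"  # assumed local path

with zipfile.ZipFile(OLD) as old, zipfile.ZipFile(NEW) as new:
    old_names, new_names = set(old.namelist()), set(new.namelist())

print("added:  ", sorted(new_names - old_names)[:10])
print("removed:", sorted(old_names - new_names)[:10])
print("common: ", len(old_names & new_names), "files to diff line by line")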
Files changed (1606)
  1. transformers/__init__.py +36 -55
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +33 -32
  4. transformers/cache_utils.py +139 -32
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +19 -49
  7. transformers/cli/transformers.py +1 -2
  8. transformers/configuration_utils.py +155 -129
  9. transformers/conversion_mapping.py +22 -158
  10. transformers/convert_slow_tokenizer.py +17 -227
  11. transformers/core_model_loading.py +185 -528
  12. transformers/data/data_collator.py +4 -12
  13. transformers/data/processors/glue.py +1 -0
  14. transformers/data/processors/utils.py +1 -0
  15. transformers/data/processors/xnli.py +1 -0
  16. transformers/dependency_versions_check.py +1 -0
  17. transformers/dependency_versions_table.py +7 -5
  18. transformers/distributed/configuration_utils.py +2 -1
  19. transformers/dynamic_module_utils.py +25 -24
  20. transformers/feature_extraction_sequence_utils.py +23 -19
  21. transformers/feature_extraction_utils.py +33 -64
  22. transformers/file_utils.py +1 -0
  23. transformers/generation/__init__.py +1 -11
  24. transformers/generation/candidate_generator.py +33 -80
  25. transformers/generation/configuration_utils.py +133 -189
  26. transformers/generation/continuous_batching/__init__.py +1 -4
  27. transformers/generation/continuous_batching/cache.py +25 -83
  28. transformers/generation/continuous_batching/cache_manager.py +45 -155
  29. transformers/generation/continuous_batching/continuous_api.py +147 -270
  30. transformers/generation/continuous_batching/requests.py +3 -51
  31. transformers/generation/continuous_batching/scheduler.py +105 -160
  32. transformers/generation/logits_process.py +128 -0
  33. transformers/generation/stopping_criteria.py +1 -1
  34. transformers/generation/streamers.py +1 -0
  35. transformers/generation/utils.py +123 -122
  36. transformers/generation/watermarking.py +6 -8
  37. transformers/hf_argparser.py +13 -9
  38. transformers/hyperparameter_search.py +2 -1
  39. transformers/image_processing_base.py +23 -12
  40. transformers/image_processing_utils.py +15 -11
  41. transformers/image_processing_utils_fast.py +75 -85
  42. transformers/image_transforms.py +42 -73
  43. transformers/image_utils.py +32 -30
  44. transformers/initialization.py +0 -37
  45. transformers/integrations/__init__.py +2 -16
  46. transformers/integrations/accelerate.py +113 -58
  47. transformers/integrations/aqlm.py +66 -36
  48. transformers/integrations/awq.py +516 -45
  49. transformers/integrations/bitnet.py +105 -47
  50. transformers/integrations/bitsandbytes.py +202 -91
  51. transformers/integrations/deepspeed.py +4 -161
  52. transformers/integrations/eetq.py +82 -84
  53. transformers/integrations/executorch.py +1 -1
  54. transformers/integrations/fbgemm_fp8.py +145 -190
  55. transformers/integrations/finegrained_fp8.py +215 -249
  56. transformers/integrations/flash_attention.py +3 -3
  57. transformers/integrations/flex_attention.py +1 -1
  58. transformers/integrations/fp_quant.py +0 -90
  59. transformers/integrations/ggml.py +2 -11
  60. transformers/integrations/higgs.py +62 -37
  61. transformers/integrations/hub_kernels.py +8 -65
  62. transformers/integrations/integration_utils.py +3 -47
  63. transformers/integrations/mistral.py +0 -12
  64. transformers/integrations/mxfp4.py +80 -33
  65. transformers/integrations/peft.py +191 -483
  66. transformers/integrations/quanto.py +56 -77
  67. transformers/integrations/spqr.py +90 -42
  68. transformers/integrations/tensor_parallel.py +221 -167
  69. transformers/integrations/torchao.py +43 -35
  70. transformers/integrations/vptq.py +59 -40
  71. transformers/kernels/__init__.py +0 -0
  72. transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
  73. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
  74. transformers/loss/loss_utils.py +0 -2
  75. transformers/masking_utils.py +55 -51
  76. transformers/model_debugging_utils.py +5 -4
  77. transformers/modelcard.py +194 -15
  78. transformers/modeling_attn_mask_utils.py +19 -19
  79. transformers/modeling_flash_attention_utils.py +27 -27
  80. transformers/modeling_gguf_pytorch_utils.py +24 -79
  81. transformers/modeling_layers.py +22 -21
  82. transformers/modeling_outputs.py +253 -242
  83. transformers/modeling_rope_utils.py +117 -138
  84. transformers/modeling_utils.py +739 -850
  85. transformers/models/__init__.py +0 -27
  86. transformers/models/afmoe/configuration_afmoe.py +33 -40
  87. transformers/models/afmoe/modeling_afmoe.py +54 -42
  88. transformers/models/afmoe/modular_afmoe.py +33 -23
  89. transformers/models/aimv2/configuration_aimv2.py +10 -2
  90. transformers/models/aimv2/modeling_aimv2.py +42 -47
  91. transformers/models/aimv2/modular_aimv2.py +19 -17
  92. transformers/models/albert/configuration_albert.py +2 -8
  93. transformers/models/albert/modeling_albert.py +69 -70
  94. transformers/models/albert/tokenization_albert.py +14 -5
  95. transformers/models/align/configuration_align.py +6 -8
  96. transformers/models/align/modeling_align.py +89 -94
  97. transformers/models/align/processing_align.py +30 -2
  98. transformers/models/altclip/configuration_altclip.py +7 -4
  99. transformers/models/altclip/modeling_altclip.py +103 -114
  100. transformers/models/altclip/processing_altclip.py +15 -2
  101. transformers/models/apertus/__init__.py +1 -0
  102. transformers/models/apertus/configuration_apertus.py +28 -23
  103. transformers/models/apertus/modeling_apertus.py +40 -39
  104. transformers/models/apertus/modular_apertus.py +38 -37
  105. transformers/models/arcee/configuration_arcee.py +30 -25
  106. transformers/models/arcee/modeling_arcee.py +39 -36
  107. transformers/models/arcee/modular_arcee.py +23 -20
  108. transformers/models/aria/configuration_aria.py +44 -31
  109. transformers/models/aria/image_processing_aria.py +27 -25
  110. transformers/models/aria/modeling_aria.py +106 -110
  111. transformers/models/aria/modular_aria.py +127 -118
  112. transformers/models/aria/processing_aria.py +35 -28
  113. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
  114. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
  115. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
  116. transformers/models/audioflamingo3/__init__.py +1 -0
  117. transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
  118. transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
  119. transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
  120. transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
  121. transformers/models/auto/auto_factory.py +7 -6
  122. transformers/models/auto/configuration_auto.py +5 -66
  123. transformers/models/auto/feature_extraction_auto.py +10 -14
  124. transformers/models/auto/image_processing_auto.py +41 -32
  125. transformers/models/auto/modeling_auto.py +188 -46
  126. transformers/models/auto/processing_auto.py +11 -24
  127. transformers/models/auto/tokenization_auto.py +588 -171
  128. transformers/models/auto/video_processing_auto.py +10 -12
  129. transformers/models/autoformer/configuration_autoformer.py +7 -4
  130. transformers/models/autoformer/modeling_autoformer.py +101 -104
  131. transformers/models/aya_vision/configuration_aya_vision.py +1 -4
  132. transformers/models/aya_vision/modeling_aya_vision.py +102 -71
  133. transformers/models/aya_vision/modular_aya_vision.py +74 -46
  134. transformers/models/aya_vision/processing_aya_vision.py +53 -25
  135. transformers/models/bamba/configuration_bamba.py +39 -34
  136. transformers/models/bamba/modeling_bamba.py +86 -82
  137. transformers/models/bamba/modular_bamba.py +72 -70
  138. transformers/models/bark/configuration_bark.py +8 -6
  139. transformers/models/bark/generation_configuration_bark.py +5 -3
  140. transformers/models/bark/modeling_bark.py +57 -54
  141. transformers/models/bark/processing_bark.py +41 -19
  142. transformers/models/bart/configuration_bart.py +6 -9
  143. transformers/models/bart/modeling_bart.py +126 -135
  144. transformers/models/barthez/tokenization_barthez.py +11 -3
  145. transformers/models/bartpho/tokenization_bartpho.py +7 -6
  146. transformers/models/beit/configuration_beit.py +11 -0
  147. transformers/models/beit/image_processing_beit.py +56 -53
  148. transformers/models/beit/image_processing_beit_fast.py +12 -10
  149. transformers/models/beit/modeling_beit.py +60 -69
  150. transformers/models/bert/configuration_bert.py +2 -12
  151. transformers/models/bert/modeling_bert.py +122 -114
  152. transformers/models/bert/tokenization_bert.py +23 -8
  153. transformers/models/bert/tokenization_bert_legacy.py +5 -3
  154. transformers/models/bert_generation/configuration_bert_generation.py +2 -17
  155. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  156. transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
  157. transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
  158. transformers/models/bertweet/tokenization_bertweet.py +3 -1
  159. transformers/models/big_bird/configuration_big_bird.py +9 -12
  160. transformers/models/big_bird/modeling_big_bird.py +109 -116
  161. transformers/models/big_bird/tokenization_big_bird.py +43 -16
  162. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  163. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
  164. transformers/models/biogpt/configuration_biogpt.py +2 -8
  165. transformers/models/biogpt/modeling_biogpt.py +76 -72
  166. transformers/models/biogpt/modular_biogpt.py +66 -62
  167. transformers/models/biogpt/tokenization_biogpt.py +5 -3
  168. transformers/models/bit/configuration_bit.py +1 -0
  169. transformers/models/bit/image_processing_bit.py +24 -21
  170. transformers/models/bit/image_processing_bit_fast.py +1 -0
  171. transformers/models/bit/modeling_bit.py +12 -25
  172. transformers/models/bitnet/configuration_bitnet.py +28 -23
  173. transformers/models/bitnet/modeling_bitnet.py +39 -36
  174. transformers/models/bitnet/modular_bitnet.py +6 -4
  175. transformers/models/blenderbot/configuration_blenderbot.py +5 -8
  176. transformers/models/blenderbot/modeling_blenderbot.py +96 -77
  177. transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
  178. transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
  179. transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
  180. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
  181. transformers/models/blip/configuration_blip.py +10 -9
  182. transformers/models/blip/image_processing_blip.py +20 -17
  183. transformers/models/blip/image_processing_blip_fast.py +1 -0
  184. transformers/models/blip/modeling_blip.py +108 -117
  185. transformers/models/blip/modeling_blip_text.py +65 -73
  186. transformers/models/blip/processing_blip.py +36 -5
  187. transformers/models/blip_2/configuration_blip_2.py +2 -2
  188. transformers/models/blip_2/modeling_blip_2.py +118 -146
  189. transformers/models/blip_2/processing_blip_2.py +38 -8
  190. transformers/models/bloom/configuration_bloom.py +2 -5
  191. transformers/models/bloom/modeling_bloom.py +104 -77
  192. transformers/models/blt/configuration_blt.py +86 -94
  193. transformers/models/blt/modeling_blt.py +81 -238
  194. transformers/models/blt/modular_blt.py +65 -228
  195. transformers/models/bridgetower/configuration_bridgetower.py +2 -7
  196. transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
  197. transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
  198. transformers/models/bridgetower/modeling_bridgetower.py +119 -141
  199. transformers/models/bridgetower/processing_bridgetower.py +16 -2
  200. transformers/models/bros/configuration_bros.py +18 -24
  201. transformers/models/bros/modeling_bros.py +80 -90
  202. transformers/models/bros/processing_bros.py +12 -2
  203. transformers/models/byt5/tokenization_byt5.py +6 -4
  204. transformers/models/camembert/configuration_camembert.py +2 -8
  205. transformers/models/camembert/modeling_camembert.py +195 -196
  206. transformers/models/camembert/modular_camembert.py +54 -51
  207. transformers/models/camembert/tokenization_camembert.py +13 -6
  208. transformers/models/canine/configuration_canine.py +2 -4
  209. transformers/models/canine/modeling_canine.py +75 -84
  210. transformers/models/canine/tokenization_canine.py +1 -2
  211. transformers/models/chameleon/configuration_chameleon.py +34 -29
  212. transformers/models/chameleon/image_processing_chameleon.py +24 -21
  213. transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
  214. transformers/models/chameleon/modeling_chameleon.py +93 -142
  215. transformers/models/chameleon/processing_chameleon.py +41 -16
  216. transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
  217. transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
  218. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
  219. transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
  220. transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
  221. transformers/models/clap/configuration_clap.py +9 -4
  222. transformers/models/clap/feature_extraction_clap.py +12 -11
  223. transformers/models/clap/modeling_clap.py +123 -136
  224. transformers/models/clap/processing_clap.py +15 -2
  225. transformers/models/clip/configuration_clip.py +2 -4
  226. transformers/models/clip/image_processing_clip.py +24 -21
  227. transformers/models/clip/image_processing_clip_fast.py +1 -9
  228. transformers/models/clip/modeling_clip.py +65 -65
  229. transformers/models/clip/processing_clip.py +14 -2
  230. transformers/models/clip/tokenization_clip.py +46 -21
  231. transformers/models/clipseg/configuration_clipseg.py +2 -4
  232. transformers/models/clipseg/modeling_clipseg.py +109 -119
  233. transformers/models/clipseg/processing_clipseg.py +42 -19
  234. transformers/models/clvp/configuration_clvp.py +5 -15
  235. transformers/models/clvp/feature_extraction_clvp.py +10 -7
  236. transformers/models/clvp/modeling_clvp.py +146 -155
  237. transformers/models/clvp/number_normalizer.py +2 -1
  238. transformers/models/clvp/processing_clvp.py +20 -3
  239. transformers/models/clvp/tokenization_clvp.py +64 -1
  240. transformers/models/code_llama/tokenization_code_llama.py +44 -18
  241. transformers/models/codegen/configuration_codegen.py +4 -4
  242. transformers/models/codegen/modeling_codegen.py +53 -63
  243. transformers/models/codegen/tokenization_codegen.py +47 -17
  244. transformers/models/cohere/configuration_cohere.py +30 -25
  245. transformers/models/cohere/modeling_cohere.py +42 -40
  246. transformers/models/cohere/modular_cohere.py +29 -26
  247. transformers/models/cohere/tokenization_cohere.py +46 -15
  248. transformers/models/cohere2/configuration_cohere2.py +32 -31
  249. transformers/models/cohere2/modeling_cohere2.py +44 -42
  250. transformers/models/cohere2/modular_cohere2.py +54 -54
  251. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
  252. transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
  253. transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
  254. transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
  255. transformers/models/colpali/configuration_colpali.py +1 -0
  256. transformers/models/colpali/modeling_colpali.py +16 -14
  257. transformers/models/colpali/modular_colpali.py +51 -11
  258. transformers/models/colpali/processing_colpali.py +52 -14
  259. transformers/models/colqwen2/modeling_colqwen2.py +28 -28
  260. transformers/models/colqwen2/modular_colqwen2.py +74 -37
  261. transformers/models/colqwen2/processing_colqwen2.py +52 -16
  262. transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
  263. transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
  264. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
  265. transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
  266. transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
  267. transformers/models/convbert/configuration_convbert.py +8 -11
  268. transformers/models/convbert/modeling_convbert.py +87 -94
  269. transformers/models/convbert/tokenization_convbert.py +1 -0
  270. transformers/models/convnext/configuration_convnext.py +1 -0
  271. transformers/models/convnext/image_processing_convnext.py +23 -20
  272. transformers/models/convnext/image_processing_convnext_fast.py +21 -16
  273. transformers/models/convnext/modeling_convnext.py +12 -9
  274. transformers/models/convnextv2/configuration_convnextv2.py +1 -0
  275. transformers/models/convnextv2/modeling_convnextv2.py +12 -9
  276. transformers/models/cpm/tokenization_cpm.py +7 -6
  277. transformers/models/cpm/tokenization_cpm_fast.py +5 -3
  278. transformers/models/cpmant/configuration_cpmant.py +1 -4
  279. transformers/models/cpmant/modeling_cpmant.py +40 -38
  280. transformers/models/cpmant/tokenization_cpmant.py +3 -1
  281. transformers/models/csm/configuration_csm.py +66 -58
  282. transformers/models/csm/generation_csm.py +35 -31
  283. transformers/models/csm/modeling_csm.py +85 -85
  284. transformers/models/csm/modular_csm.py +58 -58
  285. transformers/models/csm/processing_csm.py +68 -25
  286. transformers/models/ctrl/configuration_ctrl.py +1 -16
  287. transformers/models/ctrl/modeling_ctrl.py +44 -54
  288. transformers/models/ctrl/tokenization_ctrl.py +1 -0
  289. transformers/models/cvt/configuration_cvt.py +1 -0
  290. transformers/models/cvt/modeling_cvt.py +16 -20
  291. transformers/models/cwm/__init__.py +1 -0
  292. transformers/models/cwm/configuration_cwm.py +12 -8
  293. transformers/models/cwm/modeling_cwm.py +39 -37
  294. transformers/models/cwm/modular_cwm.py +12 -10
  295. transformers/models/d_fine/configuration_d_fine.py +5 -7
  296. transformers/models/d_fine/modeling_d_fine.py +128 -138
  297. transformers/models/d_fine/modular_d_fine.py +18 -33
  298. transformers/models/dab_detr/configuration_dab_detr.py +3 -6
  299. transformers/models/dab_detr/modeling_dab_detr.py +75 -81
  300. transformers/models/dac/configuration_dac.py +1 -0
  301. transformers/models/dac/feature_extraction_dac.py +9 -6
  302. transformers/models/dac/modeling_dac.py +26 -24
  303. transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
  304. transformers/models/data2vec/configuration_data2vec_text.py +3 -11
  305. transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
  306. transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
  307. transformers/models/data2vec/modeling_data2vec_text.py +93 -98
  308. transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
  309. transformers/models/data2vec/modular_data2vec_audio.py +1 -6
  310. transformers/models/data2vec/modular_data2vec_text.py +54 -58
  311. transformers/models/dbrx/configuration_dbrx.py +22 -36
  312. transformers/models/dbrx/modeling_dbrx.py +45 -42
  313. transformers/models/dbrx/modular_dbrx.py +33 -31
  314. transformers/models/deberta/configuration_deberta.py +1 -6
  315. transformers/models/deberta/modeling_deberta.py +60 -64
  316. transformers/models/deberta/tokenization_deberta.py +21 -9
  317. transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
  318. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
  319. transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
  320. transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
  321. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
  322. transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
  323. transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
  324. transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
  325. transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
  326. transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
  327. transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
  328. transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
  329. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
  330. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
  331. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
  332. transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
  333. transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
  334. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
  335. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  336. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
  337. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
  338. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
  339. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
  340. transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
  341. transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
  342. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
  343. transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
  344. transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
  345. transformers/models/deit/configuration_deit.py +1 -0
  346. transformers/models/deit/image_processing_deit.py +21 -18
  347. transformers/models/deit/image_processing_deit_fast.py +1 -0
  348. transformers/models/deit/modeling_deit.py +22 -24
  349. transformers/models/depth_anything/configuration_depth_anything.py +4 -2
  350. transformers/models/depth_anything/modeling_depth_anything.py +10 -10
  351. transformers/models/depth_pro/configuration_depth_pro.py +1 -0
  352. transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
  353. transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
  354. transformers/models/depth_pro/modeling_depth_pro.py +27 -31
  355. transformers/models/detr/configuration_detr.py +2 -1
  356. transformers/models/detr/image_processing_detr.py +66 -64
  357. transformers/models/detr/image_processing_detr_fast.py +34 -33
  358. transformers/models/detr/modeling_detr.py +79 -95
  359. transformers/models/dia/configuration_dia.py +15 -9
  360. transformers/models/dia/feature_extraction_dia.py +9 -6
  361. transformers/models/dia/generation_dia.py +50 -48
  362. transformers/models/dia/modeling_dia.py +69 -78
  363. transformers/models/dia/modular_dia.py +56 -64
  364. transformers/models/dia/processing_dia.py +29 -39
  365. transformers/models/dia/tokenization_dia.py +6 -3
  366. transformers/models/diffllama/configuration_diffllama.py +30 -25
  367. transformers/models/diffllama/modeling_diffllama.py +49 -46
  368. transformers/models/diffllama/modular_diffllama.py +19 -17
  369. transformers/models/dinat/configuration_dinat.py +1 -0
  370. transformers/models/dinat/modeling_dinat.py +44 -47
  371. transformers/models/dinov2/configuration_dinov2.py +1 -0
  372. transformers/models/dinov2/modeling_dinov2.py +15 -15
  373. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  374. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
  375. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
  376. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
  377. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
  378. transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
  379. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
  380. transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
  381. transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
  382. transformers/models/distilbert/configuration_distilbert.py +2 -8
  383. transformers/models/distilbert/modeling_distilbert.py +55 -55
  384. transformers/models/distilbert/tokenization_distilbert.py +1 -13
  385. transformers/models/doge/__init__.py +1 -0
  386. transformers/models/doge/configuration_doge.py +32 -39
  387. transformers/models/doge/modeling_doge.py +49 -45
  388. transformers/models/doge/modular_doge.py +63 -71
  389. transformers/models/donut/configuration_donut_swin.py +1 -0
  390. transformers/models/donut/image_processing_donut.py +29 -26
  391. transformers/models/donut/image_processing_donut_fast.py +15 -9
  392. transformers/models/donut/modeling_donut_swin.py +58 -62
  393. transformers/models/donut/processing_donut.py +26 -5
  394. transformers/models/dots1/configuration_dots1.py +33 -41
  395. transformers/models/dots1/modeling_dots1.py +45 -54
  396. transformers/models/dots1/modular_dots1.py +4 -5
  397. transformers/models/dpr/configuration_dpr.py +2 -19
  398. transformers/models/dpr/modeling_dpr.py +39 -42
  399. transformers/models/dpr/tokenization_dpr.py +9 -19
  400. transformers/models/dpr/tokenization_dpr_fast.py +9 -7
  401. transformers/models/dpt/configuration_dpt.py +2 -1
  402. transformers/models/dpt/image_processing_dpt.py +66 -65
  403. transformers/models/dpt/image_processing_dpt_fast.py +20 -18
  404. transformers/models/dpt/modeling_dpt.py +30 -32
  405. transformers/models/dpt/modular_dpt.py +17 -15
  406. transformers/models/edgetam/configuration_edgetam.py +3 -2
  407. transformers/models/edgetam/modeling_edgetam.py +86 -86
  408. transformers/models/edgetam/modular_edgetam.py +26 -21
  409. transformers/models/edgetam_video/__init__.py +1 -0
  410. transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
  411. transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
  412. transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
  413. transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
  414. transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
  415. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
  416. transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
  417. transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
  418. transformers/models/efficientnet/configuration_efficientnet.py +1 -0
  419. transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
  420. transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
  421. transformers/models/efficientnet/modeling_efficientnet.py +15 -19
  422. transformers/models/electra/configuration_electra.py +3 -13
  423. transformers/models/electra/modeling_electra.py +103 -108
  424. transformers/models/emu3/configuration_emu3.py +17 -13
  425. transformers/models/emu3/image_processing_emu3.py +39 -44
  426. transformers/models/emu3/modeling_emu3.py +108 -148
  427. transformers/models/emu3/modular_emu3.py +73 -115
  428. transformers/models/emu3/processing_emu3.py +43 -18
  429. transformers/models/encodec/configuration_encodec.py +4 -2
  430. transformers/models/encodec/feature_extraction_encodec.py +13 -10
  431. transformers/models/encodec/modeling_encodec.py +29 -39
  432. transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
  433. transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
  434. transformers/models/eomt/configuration_eomt.py +1 -0
  435. transformers/models/eomt/image_processing_eomt.py +56 -66
  436. transformers/models/eomt/image_processing_eomt_fast.py +33 -76
  437. transformers/models/eomt/modeling_eomt.py +18 -23
  438. transformers/models/eomt/modular_eomt.py +13 -18
  439. transformers/models/ernie/configuration_ernie.py +3 -24
  440. transformers/models/ernie/modeling_ernie.py +132 -127
  441. transformers/models/ernie/modular_ernie.py +103 -97
  442. transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
  443. transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
  444. transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
  445. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
  446. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
  447. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
  448. transformers/models/esm/configuration_esm.py +15 -11
  449. transformers/models/esm/modeling_esm.py +34 -38
  450. transformers/models/esm/modeling_esmfold.py +49 -53
  451. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  452. transformers/models/esm/openfold_utils/loss.py +2 -1
  453. transformers/models/esm/openfold_utils/protein.py +16 -15
  454. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  455. transformers/models/esm/tokenization_esm.py +4 -2
  456. transformers/models/evolla/configuration_evolla.py +40 -50
  457. transformers/models/evolla/modeling_evolla.py +66 -71
  458. transformers/models/evolla/modular_evolla.py +47 -53
  459. transformers/models/evolla/processing_evolla.py +35 -23
  460. transformers/models/exaone4/configuration_exaone4.py +25 -23
  461. transformers/models/exaone4/modeling_exaone4.py +38 -35
  462. transformers/models/exaone4/modular_exaone4.py +46 -44
  463. transformers/models/falcon/configuration_falcon.py +26 -31
  464. transformers/models/falcon/modeling_falcon.py +80 -82
  465. transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
  466. transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
  467. transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
  468. transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
  469. transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
  470. transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
  471. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
  472. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
  473. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
  474. transformers/models/flaubert/configuration_flaubert.py +5 -10
  475. transformers/models/flaubert/modeling_flaubert.py +143 -145
  476. transformers/models/flaubert/tokenization_flaubert.py +5 -3
  477. transformers/models/flava/configuration_flava.py +6 -5
  478. transformers/models/flava/image_processing_flava.py +67 -66
  479. transformers/models/flava/image_processing_flava_fast.py +49 -46
  480. transformers/models/flava/modeling_flava.py +136 -153
  481. transformers/models/flava/processing_flava.py +12 -2
  482. transformers/models/flex_olmo/__init__.py +1 -0
  483. transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
  484. transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
  485. transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
  486. transformers/models/florence2/configuration_florence2.py +1 -0
  487. transformers/models/florence2/modeling_florence2.py +69 -111
  488. transformers/models/florence2/modular_florence2.py +101 -104
  489. transformers/models/florence2/processing_florence2.py +47 -18
  490. transformers/models/fnet/configuration_fnet.py +2 -6
  491. transformers/models/fnet/modeling_fnet.py +80 -83
  492. transformers/models/fnet/tokenization_fnet.py +1 -0
  493. transformers/models/focalnet/configuration_focalnet.py +1 -0
  494. transformers/models/focalnet/modeling_focalnet.py +45 -51
  495. transformers/models/fsmt/configuration_fsmt.py +17 -12
  496. transformers/models/fsmt/modeling_fsmt.py +48 -49
  497. transformers/models/fsmt/tokenization_fsmt.py +5 -3
  498. transformers/models/funnel/configuration_funnel.py +1 -8
  499. transformers/models/funnel/modeling_funnel.py +93 -99
  500. transformers/models/funnel/tokenization_funnel.py +27 -17
  501. transformers/models/fuyu/configuration_fuyu.py +34 -28
  502. transformers/models/fuyu/image_processing_fuyu.py +31 -29
  503. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  504. transformers/models/fuyu/modeling_fuyu.py +53 -53
  505. transformers/models/fuyu/processing_fuyu.py +34 -23
  506. transformers/models/gemma/configuration_gemma.py +30 -25
  507. transformers/models/gemma/modeling_gemma.py +50 -46
  508. transformers/models/gemma/modular_gemma.py +47 -42
  509. transformers/models/gemma/tokenization_gemma.py +30 -10
  510. transformers/models/gemma2/configuration_gemma2.py +35 -30
  511. transformers/models/gemma2/modeling_gemma2.py +42 -39
  512. transformers/models/gemma2/modular_gemma2.py +66 -63
  513. transformers/models/gemma3/configuration_gemma3.py +44 -44
  514. transformers/models/gemma3/image_processing_gemma3.py +31 -29
  515. transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
  516. transformers/models/gemma3/modeling_gemma3.py +207 -159
  517. transformers/models/gemma3/modular_gemma3.py +204 -153
  518. transformers/models/gemma3/processing_gemma3.py +5 -5
  519. transformers/models/gemma3n/configuration_gemma3n.py +26 -36
  520. transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
  521. transformers/models/gemma3n/modeling_gemma3n.py +356 -222
  522. transformers/models/gemma3n/modular_gemma3n.py +207 -230
  523. transformers/models/gemma3n/processing_gemma3n.py +26 -12
  524. transformers/models/git/configuration_git.py +8 -5
  525. transformers/models/git/modeling_git.py +204 -266
  526. transformers/models/git/processing_git.py +14 -2
  527. transformers/models/glm/configuration_glm.py +28 -24
  528. transformers/models/glm/modeling_glm.py +40 -37
  529. transformers/models/glm/modular_glm.py +7 -4
  530. transformers/models/glm4/configuration_glm4.py +28 -24
  531. transformers/models/glm4/modeling_glm4.py +42 -40
  532. transformers/models/glm4/modular_glm4.py +10 -8
  533. transformers/models/glm46v/configuration_glm46v.py +1 -0
  534. transformers/models/glm46v/image_processing_glm46v.py +40 -35
  535. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  536. transformers/models/glm46v/modeling_glm46v.py +90 -137
  537. transformers/models/glm46v/modular_glm46v.py +3 -4
  538. transformers/models/glm46v/processing_glm46v.py +41 -7
  539. transformers/models/glm46v/video_processing_glm46v.py +11 -9
  540. transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
  541. transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
  542. transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
  543. transformers/models/glm4v/configuration_glm4v.py +20 -18
  544. transformers/models/glm4v/image_processing_glm4v.py +40 -34
  545. transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
  546. transformers/models/glm4v/modeling_glm4v.py +205 -254
  547. transformers/models/glm4v/modular_glm4v.py +224 -210
  548. transformers/models/glm4v/processing_glm4v.py +41 -7
  549. transformers/models/glm4v/video_processing_glm4v.py +11 -9
  550. transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
  551. transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
  552. transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
  553. transformers/models/glpn/configuration_glpn.py +1 -0
  554. transformers/models/glpn/image_processing_glpn.py +12 -11
  555. transformers/models/glpn/image_processing_glpn_fast.py +13 -11
  556. transformers/models/glpn/modeling_glpn.py +14 -16
  557. transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
  558. transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
  559. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
  560. transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
  561. transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
  562. transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
  563. transformers/models/gpt2/configuration_gpt2.py +2 -13
  564. transformers/models/gpt2/modeling_gpt2.py +115 -120
  565. transformers/models/gpt2/tokenization_gpt2.py +46 -15
  566. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
  567. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
  568. transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
  569. transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
  570. transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
  571. transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
  572. transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
  573. transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
  574. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
  575. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
  576. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
  577. transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
  578. transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
  579. transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
  580. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  581. transformers/models/gptj/configuration_gptj.py +4 -4
  582. transformers/models/gptj/modeling_gptj.py +87 -101
  583. transformers/models/granite/configuration_granite.py +33 -28
  584. transformers/models/granite/modeling_granite.py +46 -44
  585. transformers/models/granite/modular_granite.py +31 -29
  586. transformers/models/granite_speech/configuration_granite_speech.py +1 -0
  587. transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
  588. transformers/models/granite_speech/modeling_granite_speech.py +52 -82
  589. transformers/models/granite_speech/processing_granite_speech.py +4 -11
  590. transformers/models/granitemoe/configuration_granitemoe.py +36 -31
  591. transformers/models/granitemoe/modeling_granitemoe.py +46 -41
  592. transformers/models/granitemoe/modular_granitemoe.py +27 -22
  593. transformers/models/granitemoehybrid/__init__.py +1 -0
  594. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
  595. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
  596. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
  597. transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
  598. transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
  599. transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
  600. transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
  601. transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
  602. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
  603. transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
  604. transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
  605. transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
  606. transformers/models/groupvit/configuration_groupvit.py +2 -4
  607. transformers/models/groupvit/modeling_groupvit.py +93 -107
  608. transformers/models/helium/configuration_helium.py +29 -25
  609. transformers/models/helium/modeling_helium.py +40 -38
  610. transformers/models/helium/modular_helium.py +7 -3
  611. transformers/models/herbert/tokenization_herbert.py +28 -10
  612. transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
  613. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
  614. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
  615. transformers/models/hiera/configuration_hiera.py +1 -0
  616. transformers/models/hiera/modeling_hiera.py +66 -72
  617. transformers/models/hubert/configuration_hubert.py +2 -4
  618. transformers/models/hubert/modeling_hubert.py +37 -42
  619. transformers/models/hubert/modular_hubert.py +11 -13
  620. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
  621. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
  622. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
  623. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  624. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
  625. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
  626. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  627. transformers/models/ibert/configuration_ibert.py +2 -4
  628. transformers/models/ibert/modeling_ibert.py +62 -82
  629. transformers/models/ibert/quant_modules.py +1 -0
  630. transformers/models/idefics/configuration_idefics.py +8 -5
  631. transformers/models/idefics/image_processing_idefics.py +15 -13
  632. transformers/models/idefics/modeling_idefics.py +82 -75
  633. transformers/models/idefics/perceiver.py +3 -1
  634. transformers/models/idefics/processing_idefics.py +48 -32
  635. transformers/models/idefics/vision.py +25 -24
  636. transformers/models/idefics2/configuration_idefics2.py +3 -1
  637. transformers/models/idefics2/image_processing_idefics2.py +32 -31
  638. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  639. transformers/models/idefics2/modeling_idefics2.py +101 -127
  640. transformers/models/idefics2/processing_idefics2.py +68 -10
  641. transformers/models/idefics3/configuration_idefics3.py +4 -1
  642. transformers/models/idefics3/image_processing_idefics3.py +43 -42
  643. transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
  644. transformers/models/idefics3/modeling_idefics3.py +90 -115
  645. transformers/models/idefics3/processing_idefics3.py +69 -15
  646. transformers/models/ijepa/configuration_ijepa.py +1 -0
  647. transformers/models/ijepa/modeling_ijepa.py +11 -10
  648. transformers/models/ijepa/modular_ijepa.py +7 -5
  649. transformers/models/imagegpt/configuration_imagegpt.py +2 -9
  650. transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
  651. transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
  652. transformers/models/imagegpt/modeling_imagegpt.py +65 -76
  653. transformers/models/informer/configuration_informer.py +9 -6
  654. transformers/models/informer/modeling_informer.py +86 -88
  655. transformers/models/informer/modular_informer.py +16 -14
  656. transformers/models/instructblip/configuration_instructblip.py +2 -2
  657. transformers/models/instructblip/modeling_instructblip.py +63 -103
  658. transformers/models/instructblip/processing_instructblip.py +36 -10
  659. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  660. transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
  661. transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
  662. transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
  663. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
  664. transformers/models/internvl/configuration_internvl.py +1 -0
  665. transformers/models/internvl/modeling_internvl.py +106 -85
  666. transformers/models/internvl/modular_internvl.py +67 -47
  667. transformers/models/internvl/processing_internvl.py +45 -12
  668. transformers/models/internvl/video_processing_internvl.py +12 -10
  669. transformers/models/jamba/configuration_jamba.py +8 -5
  670. transformers/models/jamba/modeling_jamba.py +66 -68
  671. transformers/models/jamba/modular_jamba.py +55 -54
  672. transformers/models/janus/configuration_janus.py +1 -0
  673. transformers/models/janus/image_processing_janus.py +37 -35
  674. transformers/models/janus/image_processing_janus_fast.py +20 -18
  675. transformers/models/janus/modeling_janus.py +191 -115
  676. transformers/models/janus/modular_janus.py +84 -133
  677. transformers/models/janus/processing_janus.py +43 -17
  678. transformers/models/jetmoe/configuration_jetmoe.py +26 -24
  679. transformers/models/jetmoe/modeling_jetmoe.py +46 -43
  680. transformers/models/jetmoe/modular_jetmoe.py +33 -31
  681. transformers/models/kosmos2/configuration_kosmos2.py +9 -10
  682. transformers/models/kosmos2/modeling_kosmos2.py +173 -208
  683. transformers/models/kosmos2/processing_kosmos2.py +55 -40
  684. transformers/models/kosmos2_5/__init__.py +1 -0
  685. transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
  686. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
  687. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
  688. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
  689. transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
  690. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
  691. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
  692. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
  693. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
  694. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
  695. transformers/models/layoutlm/configuration_layoutlm.py +2 -14
  696. transformers/models/layoutlm/modeling_layoutlm.py +72 -77
  697. transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
  698. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
  699. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
  700. transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
  701. transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
  702. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
  703. transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
  704. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
  705. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
  706. transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
  707. transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
  708. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
  709. transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
  710. transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
  711. transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
  712. transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
  713. transformers/models/led/configuration_led.py +12 -8
  714. transformers/models/led/modeling_led.py +266 -124
  715. transformers/models/levit/configuration_levit.py +1 -0
  716. transformers/models/levit/image_processing_levit.py +21 -19
  717. transformers/models/levit/image_processing_levit_fast.py +5 -4
  718. transformers/models/levit/modeling_levit.py +19 -38
  719. transformers/models/lfm2/configuration_lfm2.py +30 -27
  720. transformers/models/lfm2/modeling_lfm2.py +50 -47
  721. transformers/models/lfm2/modular_lfm2.py +30 -29
  722. transformers/models/lfm2_moe/__init__.py +1 -0
  723. transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
  724. transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
  725. transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
  726. transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
  727. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
  728. transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
  729. transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
  730. transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
  731. transformers/models/lightglue/image_processing_lightglue.py +15 -16
  732. transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
  733. transformers/models/lightglue/modeling_lightglue.py +31 -31
  734. transformers/models/lightglue/modular_lightglue.py +28 -29
  735. transformers/models/lilt/configuration_lilt.py +2 -6
  736. transformers/models/lilt/modeling_lilt.py +70 -76
  737. transformers/models/llama/configuration_llama.py +31 -26
  738. transformers/models/llama/modeling_llama.py +39 -36
  739. transformers/models/llama/tokenization_llama.py +44 -14
  740. transformers/models/llama4/configuration_llama4.py +30 -27
  741. transformers/models/llama4/image_processing_llama4_fast.py +14 -12
  742. transformers/models/llama4/modeling_llama4.py +113 -120
  743. transformers/models/llama4/processing_llama4.py +57 -33
  744. transformers/models/llava/configuration_llava.py +1 -10
  745. transformers/models/llava/image_processing_llava.py +28 -25
  746. transformers/models/llava/image_processing_llava_fast.py +11 -9
  747. transformers/models/llava/modeling_llava.py +109 -85
  748. transformers/models/llava/processing_llava.py +51 -18
  749. transformers/models/llava_next/configuration_llava_next.py +2 -2
  750. transformers/models/llava_next/image_processing_llava_next.py +45 -43
  751. transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
  752. transformers/models/llava_next/modeling_llava_next.py +107 -110
  753. transformers/models/llava_next/processing_llava_next.py +47 -18
  754. transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
  755. transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
  756. transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
  757. transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
  758. transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
  759. transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
  760. transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
  761. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
  762. transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
  763. transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
  764. transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
  765. transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
  766. transformers/models/longcat_flash/__init__.py +1 -0
  767. transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
  768. transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
  769. transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
  770. transformers/models/longformer/configuration_longformer.py +5 -5
  771. transformers/models/longformer/modeling_longformer.py +101 -105
  772. transformers/models/longt5/configuration_longt5.py +7 -9
  773. transformers/models/longt5/modeling_longt5.py +49 -49
  774. transformers/models/luke/configuration_luke.py +2 -8
  775. transformers/models/luke/modeling_luke.py +181 -188
  776. transformers/models/luke/tokenization_luke.py +140 -107
  777. transformers/models/lxmert/configuration_lxmert.py +1 -16
  778. transformers/models/lxmert/modeling_lxmert.py +74 -65
  779. transformers/models/m2m_100/configuration_m2m_100.py +9 -7
  780. transformers/models/m2m_100/modeling_m2m_100.py +71 -83
  781. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  782. transformers/models/mamba/configuration_mamba.py +2 -1
  783. transformers/models/mamba/modeling_mamba.py +66 -58
  784. transformers/models/mamba2/configuration_mamba2.py +8 -5
  785. transformers/models/mamba2/modeling_mamba2.py +69 -68
  786. transformers/models/marian/configuration_marian.py +5 -10
  787. transformers/models/marian/modeling_marian.py +87 -93
  788. transformers/models/marian/tokenization_marian.py +6 -6
  789. transformers/models/markuplm/configuration_markuplm.py +7 -4
  790. transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
  791. transformers/models/markuplm/modeling_markuplm.py +70 -69
  792. transformers/models/markuplm/processing_markuplm.py +38 -31
  793. transformers/models/markuplm/tokenization_markuplm.py +136 -93
  794. transformers/models/mask2former/configuration_mask2former.py +8 -5
  795. transformers/models/mask2former/image_processing_mask2former.py +85 -84
  796. transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
  797. transformers/models/mask2former/modeling_mask2former.py +103 -118
  798. transformers/models/mask2former/modular_mask2former.py +8 -6
  799. transformers/models/maskformer/configuration_maskformer.py +9 -6
  800. transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
  801. transformers/models/maskformer/image_processing_maskformer.py +85 -84
  802. transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
  803. transformers/models/maskformer/modeling_maskformer.py +65 -79
  804. transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
  805. transformers/models/mbart/configuration_mbart.py +4 -9
  806. transformers/models/mbart/modeling_mbart.py +116 -131
  807. transformers/models/mbart/tokenization_mbart.py +54 -11
  808. transformers/models/mbart50/tokenization_mbart50.py +13 -8
  809. transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
  810. transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
  811. transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
  812. transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
  813. transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
  814. transformers/models/mgp_str/configuration_mgp_str.py +1 -0
  815. transformers/models/mgp_str/modeling_mgp_str.py +18 -20
  816. transformers/models/mgp_str/processing_mgp_str.py +20 -3
  817. transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
  818. transformers/models/mimi/configuration_mimi.py +40 -42
  819. transformers/models/mimi/modeling_mimi.py +113 -142
  820. transformers/models/minimax/__init__.py +1 -0
  821. transformers/models/minimax/configuration_minimax.py +43 -37
  822. transformers/models/minimax/modeling_minimax.py +51 -61
  823. transformers/models/minimax/modular_minimax.py +62 -68
  824. transformers/models/ministral/configuration_ministral.py +29 -25
  825. transformers/models/ministral/modeling_ministral.py +38 -36
  826. transformers/models/ministral/modular_ministral.py +37 -32
  827. transformers/models/ministral3/configuration_ministral3.py +27 -24
  828. transformers/models/ministral3/modeling_ministral3.py +37 -36
  829. transformers/models/ministral3/modular_ministral3.py +5 -4
  830. transformers/models/mistral/configuration_mistral.py +29 -24
  831. transformers/models/mistral/modeling_mistral.py +37 -36
  832. transformers/models/mistral/modular_mistral.py +12 -11
  833. transformers/models/mistral3/configuration_mistral3.py +1 -4
  834. transformers/models/mistral3/modeling_mistral3.py +86 -89
  835. transformers/models/mistral3/modular_mistral3.py +68 -69
  836. transformers/models/mixtral/configuration_mixtral.py +34 -29
  837. transformers/models/mixtral/modeling_mixtral.py +45 -50
  838. transformers/models/mixtral/modular_mixtral.py +31 -32
  839. transformers/models/mlcd/configuration_mlcd.py +1 -0
  840. transformers/models/mlcd/modeling_mlcd.py +14 -20
  841. transformers/models/mlcd/modular_mlcd.py +13 -17
  842. transformers/models/mllama/configuration_mllama.py +15 -10
  843. transformers/models/mllama/image_processing_mllama.py +25 -23
  844. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  845. transformers/models/mllama/modeling_mllama.py +94 -105
  846. transformers/models/mllama/processing_mllama.py +55 -6
  847. transformers/models/mluke/tokenization_mluke.py +107 -101
  848. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
  849. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
  850. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
  851. transformers/models/mobilebert/configuration_mobilebert.py +2 -4
  852. transformers/models/mobilebert/modeling_mobilebert.py +85 -77
  853. transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
  854. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
  855. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
  856. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
  857. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
  858. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
  859. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
  860. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
  861. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
  862. transformers/models/mobilevit/configuration_mobilevit.py +1 -0
  863. transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
  864. transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
  865. transformers/models/mobilevit/modeling_mobilevit.py +21 -28
  866. transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
  867. transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
  868. transformers/models/modernbert/configuration_modernbert.py +42 -44
  869. transformers/models/modernbert/modeling_modernbert.py +133 -145
  870. transformers/models/modernbert/modular_modernbert.py +170 -186
  871. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
  872. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
  873. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
  874. transformers/models/moonshine/configuration_moonshine.py +31 -34
  875. transformers/models/moonshine/modeling_moonshine.py +71 -71
  876. transformers/models/moonshine/modular_moonshine.py +83 -88
  877. transformers/models/moshi/configuration_moshi.py +23 -46
  878. transformers/models/moshi/modeling_moshi.py +187 -157
  879. transformers/models/mpnet/configuration_mpnet.py +2 -6
  880. transformers/models/mpnet/modeling_mpnet.py +57 -62
  881. transformers/models/mpnet/tokenization_mpnet.py +15 -4
  882. transformers/models/mpt/configuration_mpt.py +9 -5
  883. transformers/models/mpt/modeling_mpt.py +60 -60
  884. transformers/models/mra/configuration_mra.py +2 -8
  885. transformers/models/mra/modeling_mra.py +57 -64
  886. transformers/models/mt5/configuration_mt5.py +8 -10
  887. transformers/models/mt5/modeling_mt5.py +95 -87
  888. transformers/models/musicgen/configuration_musicgen.py +8 -12
  889. transformers/models/musicgen/modeling_musicgen.py +122 -118
  890. transformers/models/musicgen/processing_musicgen.py +21 -3
  891. transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
  892. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
  893. transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
  894. transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
  895. transformers/models/mvp/configuration_mvp.py +5 -8
  896. transformers/models/mvp/modeling_mvp.py +123 -135
  897. transformers/models/myt5/tokenization_myt5.py +10 -8
  898. transformers/models/nanochat/configuration_nanochat.py +8 -5
  899. transformers/models/nanochat/modeling_nanochat.py +40 -37
  900. transformers/models/nanochat/modular_nanochat.py +14 -12
  901. transformers/models/nemotron/configuration_nemotron.py +30 -25
  902. transformers/models/nemotron/modeling_nemotron.py +57 -56
  903. transformers/models/nllb/tokenization_nllb.py +28 -12
  904. transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
  905. transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
  906. transformers/models/nougat/image_processing_nougat.py +32 -29
  907. transformers/models/nougat/image_processing_nougat_fast.py +14 -12
  908. transformers/models/nougat/processing_nougat.py +39 -37
  909. transformers/models/nougat/tokenization_nougat.py +73 -18
  910. transformers/models/nystromformer/configuration_nystromformer.py +2 -8
  911. transformers/models/nystromformer/modeling_nystromformer.py +63 -74
  912. transformers/models/olmo/configuration_olmo.py +28 -23
  913. transformers/models/olmo/modeling_olmo.py +39 -36
  914. transformers/models/olmo/modular_olmo.py +11 -7
  915. transformers/models/olmo2/configuration_olmo2.py +28 -23
  916. transformers/models/olmo2/modeling_olmo2.py +41 -37
  917. transformers/models/olmo2/modular_olmo2.py +32 -29
  918. transformers/models/olmo3/__init__.py +1 -0
  919. transformers/models/olmo3/configuration_olmo3.py +30 -26
  920. transformers/models/olmo3/modeling_olmo3.py +39 -36
  921. transformers/models/olmo3/modular_olmo3.py +40 -37
  922. transformers/models/olmoe/configuration_olmoe.py +33 -29
  923. transformers/models/olmoe/modeling_olmoe.py +46 -52
  924. transformers/models/olmoe/modular_olmoe.py +15 -16
  925. transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
  926. transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
  927. transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
  928. transformers/models/oneformer/configuration_oneformer.py +8 -5
  929. transformers/models/oneformer/image_processing_oneformer.py +84 -83
  930. transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
  931. transformers/models/oneformer/modeling_oneformer.py +171 -147
  932. transformers/models/oneformer/processing_oneformer.py +43 -28
  933. transformers/models/openai/configuration_openai.py +1 -16
  934. transformers/models/openai/modeling_openai.py +51 -65
  935. transformers/models/openai/tokenization_openai.py +47 -8
  936. transformers/models/opt/configuration_opt.py +7 -6
  937. transformers/models/opt/modeling_opt.py +76 -78
  938. transformers/models/ovis2/__init__.py +1 -0
  939. transformers/models/ovis2/configuration_ovis2.py +1 -0
  940. transformers/models/ovis2/image_processing_ovis2.py +24 -22
  941. transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
  942. transformers/models/ovis2/modeling_ovis2.py +142 -111
  943. transformers/models/ovis2/modular_ovis2.py +45 -90
  944. transformers/models/ovis2/processing_ovis2.py +40 -12
  945. transformers/models/owlv2/configuration_owlv2.py +2 -4
  946. transformers/models/owlv2/image_processing_owlv2.py +21 -20
  947. transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
  948. transformers/models/owlv2/modeling_owlv2.py +117 -133
  949. transformers/models/owlv2/modular_owlv2.py +14 -11
  950. transformers/models/owlv2/processing_owlv2.py +49 -20
  951. transformers/models/owlvit/configuration_owlvit.py +2 -4
  952. transformers/models/owlvit/image_processing_owlvit.py +22 -21
  953. transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
  954. transformers/models/owlvit/modeling_owlvit.py +116 -132
  955. transformers/models/owlvit/processing_owlvit.py +48 -20
  956. transformers/models/paligemma/configuration_paligemma.py +1 -4
  957. transformers/models/paligemma/modeling_paligemma.py +93 -103
  958. transformers/models/paligemma/processing_paligemma.py +66 -13
  959. transformers/models/parakeet/configuration_parakeet.py +14 -7
  960. transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
  961. transformers/models/parakeet/modeling_parakeet.py +28 -32
  962. transformers/models/parakeet/modular_parakeet.py +20 -23
  963. transformers/models/parakeet/processing_parakeet.py +5 -13
  964. transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
  965. transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
  966. transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
  967. transformers/models/patchtst/configuration_patchtst.py +9 -6
  968. transformers/models/patchtst/modeling_patchtst.py +80 -97
  969. transformers/models/pegasus/configuration_pegasus.py +5 -8
  970. transformers/models/pegasus/modeling_pegasus.py +66 -72
  971. transformers/models/pegasus/tokenization_pegasus.py +45 -15
  972. transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
  973. transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
  974. transformers/models/perceiver/configuration_perceiver.py +1 -0
  975. transformers/models/perceiver/image_processing_perceiver.py +25 -22
  976. transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
  977. transformers/models/perceiver/modeling_perceiver.py +146 -165
  978. transformers/models/perceiver/tokenization_perceiver.py +6 -3
  979. transformers/models/perception_lm/configuration_perception_lm.py +1 -0
  980. transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
  981. transformers/models/perception_lm/modeling_perception_lm.py +70 -71
  982. transformers/models/perception_lm/modular_perception_lm.py +61 -65
  983. transformers/models/perception_lm/processing_perception_lm.py +47 -13
  984. transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
  985. transformers/models/persimmon/configuration_persimmon.py +28 -23
  986. transformers/models/persimmon/modeling_persimmon.py +45 -43
  987. transformers/models/phi/configuration_phi.py +28 -23
  988. transformers/models/phi/modeling_phi.py +43 -40
  989. transformers/models/phi/modular_phi.py +24 -23
  990. transformers/models/phi3/configuration_phi3.py +33 -28
  991. transformers/models/phi3/modeling_phi3.py +38 -36
  992. transformers/models/phi3/modular_phi3.py +17 -13
  993. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
  994. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
  995. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  996. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
  997. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
  998. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
  999. transformers/models/phimoe/configuration_phimoe.py +36 -31
  1000. transformers/models/phimoe/modeling_phimoe.py +45 -50
  1001. transformers/models/phimoe/modular_phimoe.py +4 -3
  1002. transformers/models/phobert/tokenization_phobert.py +6 -4
  1003. transformers/models/pix2struct/configuration_pix2struct.py +10 -12
  1004. transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
  1005. transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
  1006. transformers/models/pix2struct/modeling_pix2struct.py +52 -58
  1007. transformers/models/pix2struct/processing_pix2struct.py +30 -5
  1008. transformers/models/pixtral/configuration_pixtral.py +14 -11
  1009. transformers/models/pixtral/image_processing_pixtral.py +28 -26
  1010. transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
  1011. transformers/models/pixtral/modeling_pixtral.py +34 -28
  1012. transformers/models/pixtral/processing_pixtral.py +53 -21
  1013. transformers/models/plbart/configuration_plbart.py +5 -8
  1014. transformers/models/plbart/modeling_plbart.py +106 -119
  1015. transformers/models/plbart/modular_plbart.py +33 -39
  1016. transformers/models/plbart/tokenization_plbart.py +7 -4
  1017. transformers/models/poolformer/configuration_poolformer.py +1 -0
  1018. transformers/models/poolformer/image_processing_poolformer.py +24 -21
  1019. transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
  1020. transformers/models/poolformer/modeling_poolformer.py +13 -23
  1021. transformers/models/pop2piano/configuration_pop2piano.py +8 -7
  1022. transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
  1023. transformers/models/pop2piano/modeling_pop2piano.py +24 -26
  1024. transformers/models/pop2piano/processing_pop2piano.py +33 -25
  1025. transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
  1026. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1027. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1028. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
  1029. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
  1030. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
  1031. transformers/models/prophetnet/configuration_prophetnet.py +38 -37
  1032. transformers/models/prophetnet/modeling_prophetnet.py +131 -114
  1033. transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
  1034. transformers/models/pvt/configuration_pvt.py +1 -0
  1035. transformers/models/pvt/image_processing_pvt.py +27 -24
  1036. transformers/models/pvt/image_processing_pvt_fast.py +2 -1
  1037. transformers/models/pvt/modeling_pvt.py +21 -21
  1038. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
  1039. transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
  1040. transformers/models/qwen2/configuration_qwen2.py +25 -32
  1041. transformers/models/qwen2/modeling_qwen2.py +38 -36
  1042. transformers/models/qwen2/modular_qwen2.py +12 -11
  1043. transformers/models/qwen2/tokenization_qwen2.py +23 -12
  1044. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
  1045. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
  1046. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
  1047. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
  1048. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
  1049. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
  1050. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
  1051. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
  1052. transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
  1053. transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
  1054. transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
  1055. transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
  1056. transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
  1057. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
  1058. transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
  1059. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
  1060. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
  1061. transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
  1062. transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
  1063. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
  1064. transformers/models/qwen3/configuration_qwen3.py +27 -34
  1065. transformers/models/qwen3/modeling_qwen3.py +39 -36
  1066. transformers/models/qwen3/modular_qwen3.py +6 -4
  1067. transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
  1068. transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
  1069. transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
  1070. transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
  1071. transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
  1072. transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
  1073. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
  1074. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
  1075. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
  1076. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
  1077. transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
  1078. transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
  1079. transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
  1080. transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
  1081. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
  1082. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
  1083. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
  1084. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
  1085. transformers/models/rag/configuration_rag.py +15 -6
  1086. transformers/models/rag/modeling_rag.py +130 -127
  1087. transformers/models/rag/retrieval_rag.py +5 -3
  1088. transformers/models/rag/tokenization_rag.py +50 -0
  1089. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
  1090. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
  1091. transformers/models/reformer/configuration_reformer.py +8 -7
  1092. transformers/models/reformer/modeling_reformer.py +69 -80
  1093. transformers/models/reformer/tokenization_reformer.py +31 -11
  1094. transformers/models/regnet/configuration_regnet.py +1 -0
  1095. transformers/models/regnet/modeling_regnet.py +8 -15
  1096. transformers/models/rembert/configuration_rembert.py +2 -8
  1097. transformers/models/rembert/modeling_rembert.py +111 -121
  1098. transformers/models/rembert/tokenization_rembert.py +12 -2
  1099. transformers/models/resnet/configuration_resnet.py +1 -0
  1100. transformers/models/resnet/modeling_resnet.py +13 -27
  1101. transformers/models/roberta/configuration_roberta.py +3 -11
  1102. transformers/models/roberta/modeling_roberta.py +93 -94
  1103. transformers/models/roberta/modular_roberta.py +58 -58
  1104. transformers/models/roberta/tokenization_roberta.py +29 -17
  1105. transformers/models/roberta/tokenization_roberta_old.py +4 -2
  1106. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
  1107. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
  1108. transformers/models/roc_bert/configuration_roc_bert.py +2 -8
  1109. transformers/models/roc_bert/modeling_roc_bert.py +121 -122
  1110. transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
  1111. transformers/models/roformer/configuration_roformer.py +3 -13
  1112. transformers/models/roformer/modeling_roformer.py +81 -85
  1113. transformers/models/roformer/tokenization_roformer.py +412 -74
  1114. transformers/models/roformer/tokenization_roformer_fast.py +160 -0
  1115. transformers/models/roformer/tokenization_utils.py +1 -0
  1116. transformers/models/rt_detr/configuration_rt_detr.py +2 -1
  1117. transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
  1118. transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
  1119. transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
  1120. transformers/models/rt_detr/modeling_rt_detr.py +90 -99
  1121. transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
  1122. transformers/models/rt_detr/modular_rt_detr.py +16 -16
  1123. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
  1124. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
  1125. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
  1126. transformers/models/rwkv/configuration_rwkv.py +4 -2
  1127. transformers/models/rwkv/modeling_rwkv.py +32 -31
  1128. transformers/models/sam/configuration_sam.py +1 -3
  1129. transformers/models/sam/image_processing_sam.py +60 -59
  1130. transformers/models/sam/image_processing_sam_fast.py +27 -25
  1131. transformers/models/sam/modeling_sam.py +41 -47
  1132. transformers/models/sam/processing_sam.py +27 -39
  1133. transformers/models/sam2/configuration_sam2.py +3 -2
  1134. transformers/models/sam2/image_processing_sam2_fast.py +15 -14
  1135. transformers/models/sam2/modeling_sam2.py +90 -96
  1136. transformers/models/sam2/modular_sam2.py +91 -86
  1137. transformers/models/sam2/processing_sam2.py +47 -31
  1138. transformers/models/sam2_video/configuration_sam2_video.py +1 -0
  1139. transformers/models/sam2_video/modeling_sam2_video.py +144 -151
  1140. transformers/models/sam2_video/modular_sam2_video.py +104 -101
  1141. transformers/models/sam2_video/processing_sam2_video.py +66 -49
  1142. transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
  1143. transformers/models/sam3/configuration_sam3.py +2 -21
  1144. transformers/models/sam3/image_processing_sam3_fast.py +20 -17
  1145. transformers/models/sam3/modeling_sam3.py +170 -184
  1146. transformers/models/sam3/modular_sam3.py +8 -3
  1147. transformers/models/sam3/processing_sam3.py +52 -37
  1148. transformers/models/sam3_tracker/__init__.py +1 -0
  1149. transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
  1150. transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
  1151. transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
  1152. transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
  1153. transformers/models/sam3_tracker_video/__init__.py +1 -0
  1154. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
  1155. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
  1156. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
  1157. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
  1158. transformers/models/sam3_video/configuration_sam3_video.py +1 -14
  1159. transformers/models/sam3_video/modeling_sam3_video.py +34 -33
  1160. transformers/models/sam3_video/processing_sam3_video.py +46 -26
  1161. transformers/models/sam_hq/__init__.py +1 -1
  1162. transformers/models/sam_hq/configuration_sam_hq.py +1 -3
  1163. transformers/models/sam_hq/modeling_sam_hq.py +69 -74
  1164. transformers/models/sam_hq/modular_sam_hq.py +25 -23
  1165. transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
  1166. transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
  1167. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
  1168. transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
  1169. transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
  1170. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
  1171. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
  1172. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
  1173. transformers/models/seed_oss/configuration_seed_oss.py +32 -28
  1174. transformers/models/seed_oss/modeling_seed_oss.py +35 -33
  1175. transformers/models/seed_oss/modular_seed_oss.py +4 -3
  1176. transformers/models/segformer/configuration_segformer.py +10 -0
  1177. transformers/models/segformer/image_processing_segformer.py +42 -39
  1178. transformers/models/segformer/image_processing_segformer_fast.py +12 -10
  1179. transformers/models/segformer/modeling_segformer.py +31 -34
  1180. transformers/models/segformer/modular_segformer.py +10 -8
  1181. transformers/models/seggpt/configuration_seggpt.py +1 -0
  1182. transformers/models/seggpt/image_processing_seggpt.py +41 -38
  1183. transformers/models/seggpt/modeling_seggpt.py +38 -50
  1184. transformers/models/sew/configuration_sew.py +2 -4
  1185. transformers/models/sew/modeling_sew.py +36 -38
  1186. transformers/models/sew/modular_sew.py +13 -13
  1187. transformers/models/sew_d/configuration_sew_d.py +2 -4
  1188. transformers/models/sew_d/modeling_sew_d.py +30 -31
  1189. transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
  1190. transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
  1191. transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
  1192. transformers/models/siglip/configuration_siglip.py +2 -4
  1193. transformers/models/siglip/image_processing_siglip.py +20 -17
  1194. transformers/models/siglip/image_processing_siglip_fast.py +1 -0
  1195. transformers/models/siglip/modeling_siglip.py +75 -84
  1196. transformers/models/siglip/processing_siglip.py +14 -2
  1197. transformers/models/siglip/tokenization_siglip.py +7 -6
  1198. transformers/models/siglip2/configuration_siglip2.py +2 -5
  1199. transformers/models/siglip2/image_processing_siglip2.py +16 -15
  1200. transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
  1201. transformers/models/siglip2/modeling_siglip2.py +129 -143
  1202. transformers/models/siglip2/modular_siglip2.py +46 -47
  1203. transformers/models/siglip2/processing_siglip2.py +14 -2
  1204. transformers/models/smollm3/configuration_smollm3.py +32 -29
  1205. transformers/models/smollm3/modeling_smollm3.py +39 -36
  1206. transformers/models/smollm3/modular_smollm3.py +35 -33
  1207. transformers/models/smolvlm/configuration_smolvlm.py +4 -2
  1208. transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
  1209. transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
  1210. transformers/models/smolvlm/modeling_smolvlm.py +94 -126
  1211. transformers/models/smolvlm/modular_smolvlm.py +39 -50
  1212. transformers/models/smolvlm/processing_smolvlm.py +83 -15
  1213. transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
  1214. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
  1215. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
  1216. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1217. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
  1218. transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
  1219. transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
  1220. transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
  1221. transformers/models/speecht5/configuration_speecht5.py +9 -7
  1222. transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
  1223. transformers/models/speecht5/modeling_speecht5.py +175 -213
  1224. transformers/models/speecht5/number_normalizer.py +1 -0
  1225. transformers/models/speecht5/processing_speecht5.py +37 -3
  1226. transformers/models/speecht5/tokenization_speecht5.py +5 -4
  1227. transformers/models/splinter/configuration_splinter.py +7 -6
  1228. transformers/models/splinter/modeling_splinter.py +59 -71
  1229. transformers/models/splinter/tokenization_splinter.py +30 -9
  1230. transformers/models/squeezebert/configuration_squeezebert.py +2 -14
  1231. transformers/models/squeezebert/modeling_squeezebert.py +62 -68
  1232. transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
  1233. transformers/models/stablelm/configuration_stablelm.py +29 -24
  1234. transformers/models/stablelm/modeling_stablelm.py +45 -44
  1235. transformers/models/starcoder2/configuration_starcoder2.py +27 -30
  1236. transformers/models/starcoder2/modeling_starcoder2.py +41 -39
  1237. transformers/models/starcoder2/modular_starcoder2.py +16 -14
  1238. transformers/models/superglue/configuration_superglue.py +3 -7
  1239. transformers/models/superglue/image_processing_superglue.py +15 -15
  1240. transformers/models/superglue/image_processing_superglue_fast.py +10 -9
  1241. transformers/models/superglue/modeling_superglue.py +37 -42
  1242. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1243. transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
  1244. transformers/models/superpoint/modeling_superpoint.py +16 -18
  1245. transformers/models/swiftformer/configuration_swiftformer.py +1 -0
  1246. transformers/models/swiftformer/modeling_swiftformer.py +14 -18
  1247. transformers/models/swin/configuration_swin.py +1 -0
  1248. transformers/models/swin/modeling_swin.py +86 -86
  1249. transformers/models/swin2sr/configuration_swin2sr.py +1 -0
  1250. transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
  1251. transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
  1252. transformers/models/swin2sr/modeling_swin2sr.py +63 -81
  1253. transformers/models/swinv2/configuration_swinv2.py +1 -0
  1254. transformers/models/swinv2/modeling_swinv2.py +104 -108
  1255. transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
  1256. transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
  1257. transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
  1258. transformers/models/t5/configuration_t5.py +8 -14
  1259. transformers/models/t5/modeling_t5.py +92 -88
  1260. transformers/models/t5/tokenization_t5.py +9 -3
  1261. transformers/models/t5gemma/configuration_t5gemma.py +41 -43
  1262. transformers/models/t5gemma/modeling_t5gemma.py +107 -104
  1263. transformers/models/t5gemma/modular_t5gemma.py +120 -124
  1264. transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
  1265. transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
  1266. transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
  1267. transformers/models/table_transformer/configuration_table_transformer.py +2 -1
  1268. transformers/models/table_transformer/modeling_table_transformer.py +49 -51
  1269. transformers/models/tapas/configuration_tapas.py +2 -12
  1270. transformers/models/tapas/modeling_tapas.py +67 -68
  1271. transformers/models/tapas/tokenization_tapas.py +153 -115
  1272. transformers/models/textnet/configuration_textnet.py +1 -0
  1273. transformers/models/textnet/image_processing_textnet.py +25 -22
  1274. transformers/models/textnet/image_processing_textnet_fast.py +10 -8
  1275. transformers/models/textnet/modeling_textnet.py +16 -28
  1276. transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
  1277. transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
  1278. transformers/models/timesfm/configuration_timesfm.py +1 -0
  1279. transformers/models/timesfm/modeling_timesfm.py +22 -33
  1280. transformers/models/timesfm/modular_timesfm.py +21 -32
  1281. transformers/models/timesformer/configuration_timesformer.py +1 -0
  1282. transformers/models/timesformer/modeling_timesformer.py +16 -15
  1283. transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
  1284. transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
  1285. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
  1286. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
  1287. transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
  1288. transformers/models/trocr/configuration_trocr.py +8 -11
  1289. transformers/models/trocr/modeling_trocr.py +44 -45
  1290. transformers/models/trocr/processing_trocr.py +25 -5
  1291. transformers/models/tvp/configuration_tvp.py +2 -5
  1292. transformers/models/tvp/image_processing_tvp.py +52 -50
  1293. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1294. transformers/models/tvp/modeling_tvp.py +27 -27
  1295. transformers/models/tvp/processing_tvp.py +14 -2
  1296. transformers/models/udop/configuration_udop.py +7 -16
  1297. transformers/models/udop/modeling_udop.py +73 -71
  1298. transformers/models/udop/processing_udop.py +26 -7
  1299. transformers/models/udop/tokenization_udop.py +105 -84
  1300. transformers/models/umt5/configuration_umt5.py +7 -8
  1301. transformers/models/umt5/modeling_umt5.py +90 -94
  1302. transformers/models/unispeech/configuration_unispeech.py +2 -4
  1303. transformers/models/unispeech/modeling_unispeech.py +49 -51
  1304. transformers/models/unispeech/modular_unispeech.py +22 -22
  1305. transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
  1306. transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
  1307. transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
  1308. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1309. transformers/models/univnet/modeling_univnet.py +8 -8
  1310. transformers/models/upernet/configuration_upernet.py +1 -0
  1311. transformers/models/upernet/modeling_upernet.py +13 -11
  1312. transformers/models/vaultgemma/__init__.py +1 -0
  1313. transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
  1314. transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
  1315. transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
  1316. transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
  1317. transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
  1318. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
  1319. transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
  1320. transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
  1321. transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
  1322. transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
  1323. transformers/models/video_llava/configuration_video_llava.py +1 -4
  1324. transformers/models/video_llava/image_processing_video_llava.py +38 -35
  1325. transformers/models/video_llava/modeling_video_llava.py +146 -146
  1326. transformers/models/video_llava/processing_video_llava.py +78 -38
  1327. transformers/models/video_llava/video_processing_video_llava.py +1 -0
  1328. transformers/models/videomae/configuration_videomae.py +1 -0
  1329. transformers/models/videomae/image_processing_videomae.py +34 -31
  1330. transformers/models/videomae/modeling_videomae.py +17 -14
  1331. transformers/models/videomae/video_processing_videomae.py +1 -0
  1332. transformers/models/vilt/configuration_vilt.py +4 -6
  1333. transformers/models/vilt/image_processing_vilt.py +30 -29
  1334. transformers/models/vilt/image_processing_vilt_fast.py +16 -15
  1335. transformers/models/vilt/modeling_vilt.py +90 -116
  1336. transformers/models/vilt/processing_vilt.py +14 -2
  1337. transformers/models/vipllava/configuration_vipllava.py +1 -4
  1338. transformers/models/vipllava/modeling_vipllava.py +70 -99
  1339. transformers/models/vipllava/modular_vipllava.py +54 -78
  1340. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
  1341. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
  1342. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
  1343. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
  1344. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
  1345. transformers/models/visual_bert/configuration_visual_bert.py +2 -6
  1346. transformers/models/visual_bert/modeling_visual_bert.py +92 -98
  1347. transformers/models/vit/configuration_vit.py +1 -0
  1348. transformers/models/vit/image_processing_vit.py +22 -19
  1349. transformers/models/vit/image_processing_vit_fast.py +1 -0
  1350. transformers/models/vit/modeling_vit.py +17 -17
  1351. transformers/models/vit_mae/configuration_vit_mae.py +1 -0
  1352. transformers/models/vit_mae/modeling_vit_mae.py +27 -29
  1353. transformers/models/vit_msn/configuration_vit_msn.py +1 -0
  1354. transformers/models/vit_msn/modeling_vit_msn.py +16 -18
  1355. transformers/models/vitdet/configuration_vitdet.py +1 -0
  1356. transformers/models/vitdet/modeling_vitdet.py +14 -14
  1357. transformers/models/vitmatte/configuration_vitmatte.py +5 -2
  1358. transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
  1359. transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
  1360. transformers/models/vitmatte/modeling_vitmatte.py +11 -14
  1361. transformers/models/vitpose/configuration_vitpose.py +7 -4
  1362. transformers/models/vitpose/image_processing_vitpose.py +25 -24
  1363. transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
  1364. transformers/models/vitpose/modeling_vitpose.py +14 -14
  1365. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
  1366. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
  1367. transformers/models/vits/configuration_vits.py +1 -4
  1368. transformers/models/vits/modeling_vits.py +42 -44
  1369. transformers/models/vits/tokenization_vits.py +4 -3
  1370. transformers/models/vivit/configuration_vivit.py +1 -0
  1371. transformers/models/vivit/image_processing_vivit.py +39 -36
  1372. transformers/models/vivit/modeling_vivit.py +8 -6
  1373. transformers/models/vjepa2/__init__.py +1 -0
  1374. transformers/models/vjepa2/configuration_vjepa2.py +1 -0
  1375. transformers/models/vjepa2/modeling_vjepa2.py +32 -31
  1376. transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
  1377. transformers/models/voxtral/__init__.py +1 -0
  1378. transformers/models/voxtral/configuration_voxtral.py +2 -0
  1379. transformers/models/voxtral/modeling_voxtral.py +47 -40
  1380. transformers/models/voxtral/modular_voxtral.py +40 -37
  1381. transformers/models/voxtral/processing_voxtral.py +48 -25
  1382. transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
  1383. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
  1384. transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
  1385. transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
  1386. transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
  1387. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
  1388. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
  1389. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
  1390. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
  1391. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
  1392. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
  1393. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
  1394. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
  1395. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
  1396. transformers/models/wavlm/configuration_wavlm.py +2 -4
  1397. transformers/models/wavlm/modeling_wavlm.py +48 -50
  1398. transformers/models/wavlm/modular_wavlm.py +5 -4
  1399. transformers/models/whisper/configuration_whisper.py +5 -6
  1400. transformers/models/whisper/english_normalizer.py +4 -3
  1401. transformers/models/whisper/feature_extraction_whisper.py +24 -9
  1402. transformers/models/whisper/generation_whisper.py +48 -26
  1403. transformers/models/whisper/modeling_whisper.py +73 -79
  1404. transformers/models/whisper/processing_whisper.py +20 -3
  1405. transformers/models/whisper/tokenization_whisper.py +43 -11
  1406. transformers/models/x_clip/configuration_x_clip.py +2 -4
  1407. transformers/models/x_clip/modeling_x_clip.py +93 -96
  1408. transformers/models/x_clip/processing_x_clip.py +14 -2
  1409. transformers/models/xcodec/configuration_xcodec.py +6 -4
  1410. transformers/models/xcodec/modeling_xcodec.py +17 -20
  1411. transformers/models/xglm/configuration_xglm.py +8 -9
  1412. transformers/models/xglm/modeling_xglm.py +55 -60
  1413. transformers/models/xglm/tokenization_xglm.py +11 -3
  1414. transformers/models/xlm/configuration_xlm.py +8 -10
  1415. transformers/models/xlm/modeling_xlm.py +144 -144
  1416. transformers/models/xlm/tokenization_xlm.py +5 -3
  1417. transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
  1418. transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
  1419. transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
  1420. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
  1421. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
  1422. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
  1423. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
  1424. transformers/models/xlnet/configuration_xlnet.py +12 -3
  1425. transformers/models/xlnet/modeling_xlnet.py +163 -152
  1426. transformers/models/xlnet/tokenization_xlnet.py +9 -2
  1427. transformers/models/xlstm/configuration_xlstm.py +12 -8
  1428. transformers/models/xlstm/modeling_xlstm.py +65 -62
  1429. transformers/models/xmod/configuration_xmod.py +3 -11
  1430. transformers/models/xmod/modeling_xmod.py +110 -108
  1431. transformers/models/yolos/configuration_yolos.py +1 -0
  1432. transformers/models/yolos/image_processing_yolos.py +62 -60
  1433. transformers/models/yolos/image_processing_yolos_fast.py +45 -42
  1434. transformers/models/yolos/modeling_yolos.py +16 -16
  1435. transformers/models/yolos/modular_yolos.py +19 -17
  1436. transformers/models/yoso/configuration_yoso.py +2 -8
  1437. transformers/models/yoso/modeling_yoso.py +63 -70
  1438. transformers/models/zamba/configuration_zamba.py +8 -5
  1439. transformers/models/zamba/modeling_zamba.py +78 -81
  1440. transformers/models/zamba2/configuration_zamba2.py +50 -44
  1441. transformers/models/zamba2/modeling_zamba2.py +97 -97
  1442. transformers/models/zamba2/modular_zamba2.py +48 -46
  1443. transformers/models/zoedepth/configuration_zoedepth.py +2 -1
  1444. transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
  1445. transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
  1446. transformers/models/zoedepth/modeling_zoedepth.py +18 -26
  1447. transformers/pipelines/__init__.py +114 -57
  1448. transformers/pipelines/any_to_any.py +22 -14
  1449. transformers/pipelines/audio_utils.py +2 -1
  1450. transformers/pipelines/automatic_speech_recognition.py +12 -20
  1451. transformers/pipelines/base.py +27 -15
  1452. transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
  1453. transformers/pipelines/deprecated/text2text_generation.py +408 -0
  1454. transformers/pipelines/document_question_answering.py +2 -4
  1455. transformers/pipelines/image_text_to_text.py +1 -0
  1456. transformers/pipelines/image_to_text.py +229 -0
  1457. transformers/pipelines/question_answering.py +44 -5
  1458. transformers/pipelines/text_classification.py +14 -1
  1459. transformers/pipelines/text_generation.py +1 -1
  1460. transformers/pipelines/text_to_audio.py +2 -2
  1461. transformers/pipelines/token_classification.py +22 -1
  1462. transformers/pipelines/video_classification.py +9 -1
  1463. transformers/pipelines/zero_shot_audio_classification.py +1 -0
  1464. transformers/pipelines/zero_shot_classification.py +6 -0
  1465. transformers/pipelines/zero_shot_image_classification.py +7 -0
  1466. transformers/processing_utils.py +145 -230
  1467. transformers/quantizers/auto.py +4 -2
  1468. transformers/quantizers/base.py +173 -53
  1469. transformers/quantizers/quantizer_aqlm.py +23 -2
  1470. transformers/quantizers/quantizer_auto_round.py +12 -2
  1471. transformers/quantizers/quantizer_awq.py +89 -20
  1472. transformers/quantizers/quantizer_bitnet.py +14 -4
  1473. transformers/quantizers/quantizer_bnb_4bit.py +155 -18
  1474. transformers/quantizers/quantizer_bnb_8bit.py +110 -24
  1475. transformers/quantizers/quantizer_compressed_tensors.py +9 -2
  1476. transformers/quantizers/quantizer_eetq.py +74 -16
  1477. transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
  1478. transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
  1479. transformers/quantizers/quantizer_fp_quant.py +82 -52
  1480. transformers/quantizers/quantizer_gptq.py +28 -8
  1481. transformers/quantizers/quantizer_higgs.py +60 -42
  1482. transformers/quantizers/quantizer_hqq.py +153 -144
  1483. transformers/quantizers/quantizer_mxfp4.py +194 -14
  1484. transformers/quantizers/quantizer_quanto.py +79 -35
  1485. transformers/quantizers/quantizer_quark.py +18 -36
  1486. transformers/quantizers/quantizer_spqr.py +12 -4
  1487. transformers/quantizers/quantizer_torchao.py +325 -50
  1488. transformers/quantizers/quantizer_vptq.py +27 -4
  1489. transformers/quantizers/quantizers_utils.py +0 -20
  1490. transformers/safetensors_conversion.py +3 -9
  1491. transformers/testing_utils.py +82 -326
  1492. transformers/tokenization_mistral_common.py +903 -568
  1493. transformers/tokenization_utils_base.py +340 -220
  1494. transformers/tokenization_utils_sentencepiece.py +6 -5
  1495. transformers/tokenization_utils_tokenizers.py +113 -226
  1496. transformers/trainer.py +53 -60
  1497. transformers/trainer_callback.py +0 -8
  1498. transformers/trainer_seq2seq.py +1 -5
  1499. transformers/trainer_utils.py +1 -1
  1500. transformers/training_args.py +41 -77
  1501. transformers/utils/__init__.py +4 -8
  1502. transformers/utils/attention_visualizer.py +5 -5
  1503. transformers/utils/auto_docstring.py +37 -599
  1504. transformers/utils/doc.py +36 -4
  1505. transformers/utils/dummy_pt_objects.py +42 -0
  1506. transformers/utils/generic.py +28 -111
  1507. transformers/utils/hub.py +15 -5
  1508. transformers/utils/import_utils.py +32 -165
  1509. transformers/utils/kernel_config.py +19 -74
  1510. transformers/utils/loading_report.py +15 -25
  1511. transformers/utils/quantization_config.py +241 -72
  1512. transformers/video_processing_utils.py +39 -41
  1513. transformers/video_utils.py +22 -18
  1514. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
  1515. transformers-5.0.0rc0.dist-info/RECORD +1987 -0
  1516. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
  1517. transformers/integrations/moe.py +0 -360
  1518. transformers/integrations/quark.py +0 -53
  1519. transformers/loss/loss_lw_detr.py +0 -356
  1520. transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
  1521. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
  1522. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
  1523. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
  1524. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
  1525. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
  1526. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
  1527. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
  1528. transformers/models/fast_vlm/__init__.py +0 -27
  1529. transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
  1530. transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
  1531. transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
  1532. transformers/models/glm4_moe_lite/__init__.py +0 -28
  1533. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
  1534. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
  1535. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
  1536. transformers/models/glm_image/__init__.py +0 -31
  1537. transformers/models/glm_image/configuration_glm_image.py +0 -351
  1538. transformers/models/glm_image/image_processing_glm_image.py +0 -503
  1539. transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
  1540. transformers/models/glm_image/modeling_glm_image.py +0 -1642
  1541. transformers/models/glm_image/modular_glm_image.py +0 -1531
  1542. transformers/models/glm_image/processing_glm_image.py +0 -217
  1543. transformers/models/glmasr/__init__.py +0 -29
  1544. transformers/models/glmasr/configuration_glmasr.py +0 -196
  1545. transformers/models/glmasr/modeling_glmasr.py +0 -517
  1546. transformers/models/glmasr/modular_glmasr.py +0 -443
  1547. transformers/models/glmasr/processing_glmasr.py +0 -331
  1548. transformers/models/jais2/__init__.py +0 -27
  1549. transformers/models/jais2/configuration_jais2.py +0 -148
  1550. transformers/models/jais2/modeling_jais2.py +0 -484
  1551. transformers/models/jais2/modular_jais2.py +0 -194
  1552. transformers/models/lasr/__init__.py +0 -29
  1553. transformers/models/lasr/configuration_lasr.py +0 -244
  1554. transformers/models/lasr/feature_extraction_lasr.py +0 -275
  1555. transformers/models/lasr/modeling_lasr.py +0 -727
  1556. transformers/models/lasr/modular_lasr.py +0 -574
  1557. transformers/models/lasr/processing_lasr.py +0 -100
  1558. transformers/models/lasr/tokenization_lasr.py +0 -184
  1559. transformers/models/lighton_ocr/__init__.py +0 -28
  1560. transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
  1561. transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
  1562. transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
  1563. transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
  1564. transformers/models/lw_detr/__init__.py +0 -27
  1565. transformers/models/lw_detr/configuration_lw_detr.py +0 -374
  1566. transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
  1567. transformers/models/lw_detr/modular_lw_detr.py +0 -1615
  1568. transformers/models/minimax_m2/__init__.py +0 -28
  1569. transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
  1570. transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
  1571. transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
  1572. transformers/models/paddleocr_vl/__init__.py +0 -31
  1573. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
  1574. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
  1575. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
  1576. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
  1577. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
  1578. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
  1579. transformers/models/pe_audio/__init__.py +0 -29
  1580. transformers/models/pe_audio/configuration_pe_audio.py +0 -204
  1581. transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
  1582. transformers/models/pe_audio/modeling_pe_audio.py +0 -819
  1583. transformers/models/pe_audio/modular_pe_audio.py +0 -298
  1584. transformers/models/pe_audio_video/__init__.py +0 -28
  1585. transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
  1586. transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
  1587. transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
  1588. transformers/models/pe_video/__init__.py +0 -29
  1589. transformers/models/pe_video/configuration_pe_video.py +0 -209
  1590. transformers/models/pe_video/modeling_pe_video.py +0 -647
  1591. transformers/models/pe_video/modular_pe_video.py +0 -231
  1592. transformers/models/pe_video/processing_pe_video.py +0 -10
  1593. transformers/models/pe_video/video_processing_pe_video.py +0 -64
  1594. transformers/models/pixio/__init__.py +0 -29
  1595. transformers/models/pixio/configuration_pixio.py +0 -150
  1596. transformers/models/pixio/modeling_pixio.py +0 -507
  1597. transformers/models/pixio/modular_pixio.py +0 -403
  1598. transformers/models/solar_open/__init__.py +0 -27
  1599. transformers/models/solar_open/configuration_solar_open.py +0 -184
  1600. transformers/models/solar_open/modeling_solar_open.py +0 -642
  1601. transformers/models/solar_open/modular_solar_open.py +0 -224
  1602. transformers/trainer_jit_checkpoint.py +0 -125
  1603. transformers-5.0.0.dist-info/RECORD +0 -2068
  1604. {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
  1605. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
  1606. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0
transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py

@@ -4,6 +4,7 @@
  # the file from the modular. If any change should be done, please apply the change to the
  # modular_qwen2_5_omni.py file directly. One of our CI enforces this.
  # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # coding=utf-8
  # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
  #
  #
@@ -22,7 +23,7 @@
  import math
  from collections.abc import Callable
  from dataclasses import dataclass
- from typing import Any, Optional
+ from typing import Any, Optional, Union

  import numpy as np
  import torch
@@ -30,27 +31,18 @@ import torch.nn.functional as F
  from torch import nn
  from torch.nn import Parameter

- from ... import initialization as init
  from ...activations import ACT2FN
  from ...cache_utils import Cache, DynamicCache
  from ...generation import GenerationMixin
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
  from ...modeling_layers import GradientCheckpointingLayer
- from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, ModelOutput
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
- from ...utils import (
-     TransformersKwargs,
-     auto_docstring,
-     can_return_tuple,
-     check_torch_load_is_safe,
-     logging,
-     torch_compilable_check,
- )
+ from ...utils import TransformersKwargs, auto_docstring, check_torch_load_is_safe, logging
  from ...utils.deprecation import deprecate_kwarg
- from ...utils.generic import check_model_inputs, is_flash_attention_requested, maybe_autocast
  from ...utils.hub import cached_file
  from ..qwen2.modeling_qwen2 import Qwen2RMSNorm
  from .configuration_qwen2_5_omni import (
@@ -69,52 +61,6 @@ from .configuration_qwen2_5_omni import (
  logger = logging.get_logger(__name__)


- def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
-     """Generates a 1D Kaiser-windowed sinc filter.
-
-     Args:
-         cutoff (float): Normalized cutoff frequency (0 to 0.5).
-         half_width (float): Transition bandwidth.
-         kernel_size (int): Number of filter taps.
-
-     Returns:
-         torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
-     """
-     is_even = kernel_size % 2 == 0
-     half_size = kernel_size // 2
-
-     # Compute Kaiser window parameters
-     delta_f = 4 * half_width
-     attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-
-     if attenuation > 50.0:
-         beta = 0.1102 * (attenuation - 8.7)
-     elif attenuation >= 21.0:
-         beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
-     else:
-         beta = 0.0
-
-     kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
-
-     # Compute time indices
-     if is_even:
-         time_indices = torch.arange(-half_size, half_size) + 0.5
-     else:
-         time_indices = torch.arange(kernel_size) - half_size
-
-     # Compute sinc filter
-     if cutoff == 0:
-         return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
-
-     sinc_filter = torch.sinc(2 * cutoff * time_indices)
-     normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
-
-     # Normalize to ensure sum = 1 (avoid leakage of constant component)
-     normalized_filter /= normalized_filter.sum()
-
-     return normalized_filter.view(1, 1, kernel_size)
-
-
  @auto_docstring
  class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
      config: Qwen2_5OmniConfig
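The hunk above drops `kaiser_sinc_filter1d`, a Kaiser-windowed sinc low-pass design (Kaiser's empirical attenuation-to-beta formula) that fed the model's 1D up/down-sampling filters. For readers tracking the removal, here is a minimal, self-contained sketch of generating such taps and applying them as an anti-aliasing filter in plain PyTorch; the helper name and the 2x-decimation usage are illustrative only, not part of transformers:

```python
import math

import torch
import torch.nn.functional as F


def kaiser_sinc_taps(cutoff: float, half_width: float, kernel_size: int) -> torch.Tensor:
    """Kaiser-windowed sinc low-pass taps of shape (1, 1, kernel_size). Assumes cutoff > 0."""
    half_size = kernel_size // 2
    # Kaiser's empirical formula: stopband attenuation -> window shape parameter beta.
    attenuation = 2.285 * (half_size - 1) * math.pi * (4 * half_width) + 7.95
    if attenuation > 50.0:
        beta = 0.1102 * (attenuation - 8.7)
    elif attenuation >= 21.0:
        beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
    else:
        beta = 0.0
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
    # Center the time grid; even-length kernels are offset by half a sample.
    if kernel_size % 2 == 0:
        t = torch.arange(-half_size, half_size) + 0.5
    else:
        t = torch.arange(kernel_size) - half_size
    taps = 2 * cutoff * window * torch.sinc(2 * cutoff * t)
    return (taps / taps.sum()).view(1, 1, kernel_size)


# Hypothetical usage: low-pass a mono waveform before 2x decimation.
wave = torch.randn(1, 1, 16000)
taps = kaiser_sinc_taps(cutoff=0.25, half_width=0.3, kernel_size=12)
filtered = F.conv1d(wave, taps, padding=taps.shape[-1] // 2)
downsampled = filtered[..., ::2]
```

The `taps.sum()` normalization gives the filter unit DC gain, so a constant signal passes through unchanged; that is the same "avoid leakage of constant component" step the removed helper performed.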
@@ -128,23 +74,6 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
      _can_compile_fullgraph = False
      _supports_attention_backend = True

-     def _init_weights(self, module):
-         super()._init_weights(module)
-         if isinstance(module, SinusoidsPositionEmbedding):
-             log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
-             inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
-             scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-             init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
-         elif isinstance(module, UpSample1d):
-             filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
-             init.copy_(module.filter, filter_tensor)
-         elif isinstance(module, DownSample1d):
-             filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
-             init.copy_(module.filter, filter_tensor)
-         elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
-             inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
-             init.copy_(module.inv_freq, inv_freq)
-

  class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
      input_modalities = ("image", "video", "audio", "text")
@@ -263,13 +192,13 @@ class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedMo
263
192
 
264
193
  def get_rope_index(
265
194
  self,
266
- input_ids: torch.LongTensor | None = None,
267
- image_grid_thw: torch.LongTensor | None = None,
268
- video_grid_thw: torch.LongTensor | None = None,
269
- attention_mask: torch.Tensor | None = None,
195
+ input_ids: Optional[torch.LongTensor] = None,
196
+ image_grid_thw: Optional[torch.LongTensor] = None,
197
+ video_grid_thw: Optional[torch.LongTensor] = None,
198
+ attention_mask: Optional[torch.Tensor] = None,
270
199
  use_audio_in_video: bool = False,
271
- audio_seqlens: torch.LongTensor | None = None,
272
- second_per_grids: torch.Tensor | None = None,
200
+ audio_seqlens: Optional[torch.LongTensor] = None,
201
+ second_per_grids: Optional[torch.Tensor] = None,
273
202
  ) -> tuple[torch.Tensor, torch.Tensor]:
274
203
  """
275
204
  Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
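
For the degenerate text-only case, the 3-D index this method returns collapses to ordinary 1-D positions replicated across the temporal/height/width axes. A minimal sketch assuming the Qwen2-VL-style fallback, illustrative rather than copied from this file:

    import torch

    attention_mask = torch.tensor([[0, 1, 1, 1]])                # one left-padded sample
    position_ids = attention_mask.long().cumsum(-1) - 1          # [[-1, 0, 1, 2]]
    position_ids = position_ids.masked_fill(attention_mask == 0, 1)
    position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)   # (3, bs, seq): t/h/w axes agree
    print(position_ids.shape)  # torch.Size([3, 1, 4])
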
@@ -577,12 +506,12 @@ class Qwen2_5OmniThinkerCausalLMOutputWithPast(ModelOutput):
577
506
  The rope index difference between sequence length and multimodal rope.
578
507
  """
579
508
 
580
- loss: torch.FloatTensor | None = None
581
- logits: torch.FloatTensor | None = None
582
- past_key_values: Cache | None = None
583
- hidden_states: tuple[torch.FloatTensor] | None = None
584
- attentions: tuple[torch.FloatTensor] | None = None
585
- rope_deltas: torch.LongTensor | None = None
509
+ loss: Optional[torch.FloatTensor] = None
510
+ logits: Optional[torch.FloatTensor] = None
511
+ past_key_values: Optional[Cache] = None
512
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
513
+ attentions: Optional[tuple[torch.FloatTensor]] = None
514
+ rope_deltas: Optional[torch.LongTensor] = None
586
515
 
587
516
 
588
517
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
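
`repeat_kv` is the usual grouped-query-attention helper that duplicates each key/value head `n_rep` times so KV heads line up with the query heads. A minimal runnable sketch, assuming the standard `(batch, num_kv_heads, seq, head_dim)` layout:

    import torch

    def repeat_kv_sketch(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        # insert a repeat axis, broadcast, then fold it into the head axis
        hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

    kv = torch.randn(2, 4, 10, 64)                            # 4 KV heads
    assert repeat_kv_sketch(kv, 8).shape == (2, 32, 10, 64)   # expanded to 32 query heads
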
@@ -602,7 +531,7 @@ def eager_attention_forward(
602
531
  query: torch.Tensor,
603
532
  key: torch.Tensor,
604
533
  value: torch.Tensor,
605
- attention_mask: torch.Tensor | None,
534
+ attention_mask: Optional[torch.Tensor],
606
535
  scaling: float,
607
536
  dropout: float = 0.0,
608
537
  **kwargs,
@@ -656,10 +585,10 @@ class Qwen2_5OmniAudioAttention(nn.Module):
656
585
  def forward(
657
586
  self,
658
587
  hidden_states: torch.Tensor,
659
- cu_seqlens: torch.Tensor | None = None,
660
- attention_mask: torch.Tensor | None = None,
588
+ cu_seqlens: Optional[torch.Tensor] = None,
589
+ attention_mask: Optional[torch.Tensor] = None,
661
590
  **kwargs,
662
- ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
591
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
663
592
  """Input shape: Batch x Time x Channel"""
664
593
 
665
594
  seq_length, _ = hidden_states.size()
@@ -716,7 +645,7 @@ class Qwen2_5OmniAudioEncoderLayer(GradientCheckpointingLayer):
716
645
  self,
717
646
  hidden_states: torch.Tensor,
718
647
  cu_seqlens: torch.Tensor,
719
- attention_mask: torch.Tensor | None = None,
648
+ attention_mask: Optional[torch.Tensor] = None,
720
649
  **kwargs,
721
650
  ) -> torch.Tensor:
722
651
  """
@@ -756,9 +685,6 @@ class Qwen2_5OmniAudioEncoderLayer(GradientCheckpointingLayer):
756
685
  class SinusoidsPositionEmbedding(nn.Module):
757
686
  def __init__(self, length, channels, max_timescale=10000):
758
687
  super().__init__()
759
- self.length = length
760
- self.channels = channels
761
- self.max_timescale = max_timescale
762
688
  if channels % 2 != 0:
763
689
  raise ValueError("SinusoidsPositionEmbedding needs even channels input")
764
690
  log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
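
The constructor fills a fixed sin/cos table from these timescales; a standalone sketch of the full table, with the sin/cos concatenation mirroring the initialization logic removed earlier in this diff:

    import numpy as np
    import torch

    length, channels, max_timescale = 8, 16, 10000
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    positional_embedding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
    assert positional_embedding.shape == (length, channels)
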
@@ -786,10 +712,6 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
786
712
  input_modalities = "audio"
787
713
  _no_split_modules = ["Qwen2_5OmniAudioEncoderLayer"]
788
714
  _supports_sdpa = True
789
- _can_record_outputs = {
790
- "hidden_states": Qwen2_5OmniAudioEncoderLayer,
791
- "attentions": Qwen2_5OmniAudioAttention,
792
- }
793
715
 
794
716
  def __init__(self, config: Qwen2_5OmniAudioEncoderConfig):
795
717
  super().__init__(config)
@@ -828,7 +750,7 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
828
750
  # NOTE: the created attention mask only approximates the ragged FA2 attention by
829
751
  # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
830
752
  # blocks, though it will not be a 100% match for FA2's `varlen` path.
831
- if is_flash_attention_requested(self.config):
753
+ if self.config._attn_implementation == "flash_attention_2":
832
754
  return None
833
755
 
834
756
  seq_length = inputs_tensor.shape[0]
@@ -842,9 +764,14 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
842
764
  attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
843
765
  return attention_mask
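
An isolated example of the block-diagonal mask built in the loop above: positions inside the same `cu_seqlens` segment may attend to each other, everything else is masked (the fill value, the dtype minimum, is an assumption):

    import torch

    cu_seqlens = torch.tensor([0, 3, 7])      # two packed segments, lengths 3 and 4
    seq_length = int(cu_seqlens[-1])
    attention_mask = torch.full((1, 1, seq_length, seq_length), torch.finfo(torch.float32).min)
    for i in range(1, len(cu_seqlens)):
        attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
    # 0 inside each 3x3 / 4x4 diagonal block, a large negative value everywhere else
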
844
766
 
845
- @check_model_inputs(tie_last_hidden_states=False)
846
767
  @auto_docstring
847
- def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
768
+ def forward(
769
+ self,
770
+ input_features,
771
+ feature_lens=None,
772
+ aftercnn_lens=None,
773
+ **kwargs,
774
+ ):
848
775
  r"""
849
776
  feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
850
777
  Length (in mel frames) of each input feature sequence.
@@ -853,7 +780,11 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
853
780
  """
854
781
  chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
855
782
 
856
- chunk_lengths = torch.full((chunk_num.sum(),), self.n_window * 2, dtype=torch.long, device=feature_lens.device)
783
+ chunk_lengths = torch.tensor(
784
+ [self.n_window * 2] * chunk_num.sum(),
785
+ dtype=torch.long,
786
+ device=feature_lens.device,
787
+ )
857
788
  tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]
858
789
  chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
859
790
  chunk_lengths = torch.where(chunk_lengths == 0, self.n_window * 2, chunk_lengths)
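
A worked example of the windowing arithmetic above, with an illustrative `n_window`; each chunk holds `n_window * 2` mel frames and the tail chunk of each clip keeps the remainder:

    import torch
    import torch.nn.functional as F

    n_window = 50
    feature_lens = torch.tensor([230, 100])                        # two audio clips
    chunk_num = torch.ceil(feature_lens / (n_window * 2)).long()   # -> [3, 1]
    chunk_lengths = torch.full((int(chunk_num.sum()),), n_window * 2, dtype=torch.long)
    tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]  # last chunk per clip
    chunk_lengths[tail_chunk_index] = feature_lens % (n_window * 2)
    chunk_lengths = torch.where(chunk_lengths == 0, n_window * 2, chunk_lengths)
    print(chunk_lengths)  # tensor([100, 100,  30, 100])
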
@@ -894,7 +825,7 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
894
825
  each_audio_states = self.proj(each_audio_states)
895
826
  token_audio_list.append(each_audio_states)
896
827
  token_audio = torch.cat(token_audio_list, dim=0)
897
- return BaseModelOutputWithPooling(last_hidden_state=token_audio)
828
+ return BaseModelOutput(last_hidden_state=token_audio)
898
829
 
899
830
  def padded_and_mask_function(self, tensor_list, tensor_len, padding_value=0, padding_side="right"):
900
831
  """
@@ -983,7 +914,7 @@ class Qwen2_5OmniVisionAttention(nn.Module):
983
914
  self,
984
915
  hidden_states: torch.Tensor,
985
916
  cu_seqlens: torch.Tensor,
986
- rotary_pos_emb: torch.Tensor | None = None,
917
+ rotary_pos_emb: Optional[torch.Tensor] = None,
987
918
  **kwargs,
988
919
  ) -> torch.Tensor:
989
920
  seq_length = hidden_states.shape[0]
@@ -1001,7 +932,7 @@ class Qwen2_5OmniVisionAttention(nn.Module):
1001
932
  if self.config._attn_implementation != "eager":
1002
933
  attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
1003
934
 
1004
- if is_flash_attention_requested(self.config):
935
+ if self.config._attn_implementation == "flash_attention_2":
1005
936
  # Flash Attention 2: Use cu_seqlens for variable length attention
1006
937
  max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
1007
938
  attn_output, _ = attention_interface(
@@ -1073,7 +1004,7 @@ class Qwen2_5OmniVisionBlock(GradientCheckpointingLayer):
1073
1004
  self,
1074
1005
  hidden_states: torch.Tensor,
1075
1006
  cu_seqlens: torch.Tensor,
1076
- rotary_pos_emb: torch.Tensor | None = None,
1007
+ rotary_pos_emb: Optional[torch.Tensor] = None,
1077
1008
  **kwargs,
1078
1009
  ) -> torch.Tensor:
1079
1010
  hidden_states = hidden_states + self.attn(
@@ -1086,22 +1017,6 @@ class Qwen2_5OmniVisionBlock(GradientCheckpointingLayer):
1086
1017
  return hidden_states
1087
1018
 
1088
1019
 
1089
- class Qwen2_5_VisionRotaryEmbedding(nn.Module):
1090
- inv_freq: torch.Tensor # fix linting for `register_buffer`
1091
-
1092
- def __init__(self, dim: int, theta: float = 10000.0) -> None:
1093
- super().__init__()
1094
- self.dim = dim
1095
- self.theta = theta
1096
- inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
1097
- self.register_buffer("inv_freq", inv_freq, persistent=False)
1098
-
1099
- def forward(self, seqlen: int) -> torch.Tensor:
1100
- seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
1101
- freqs = torch.outer(seq, self.inv_freq)
1102
- return freqs
1103
-
1104
-
1105
1020
  class Qwen2_5_VisionPatchEmbed(nn.Module):
1106
1021
  def __init__(
1107
1022
  self,
@@ -1128,6 +1043,20 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
1128
1043
  return hidden_states
1129
1044
 
1130
1045
 
1046
+ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
1047
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
1048
+
1049
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
1050
+ super().__init__()
1051
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
1052
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
1053
+
1054
+ def forward(self, seqlen: int) -> torch.Tensor:
1055
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
1056
+ freqs = torch.outer(seq, self.inv_freq)
1057
+ return freqs
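
The same computation outside the module, with illustrative `dim`/`theta`/`seqlen` values: an outer product of positions with the inverse frequencies.

    import torch

    dim, theta, seqlen = 8, 10000.0, 5
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    freqs = torch.outer(torch.arange(seqlen, dtype=torch.float), inv_freq)
    assert freqs.shape == (seqlen, dim // 2)
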
1058
+
1059
+
1131
1060
  class Qwen2_5OmniPatchMerger(nn.Module):
1132
1061
  def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
1133
1062
  super().__init__()
@@ -1148,10 +1077,6 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
1148
1077
  config: Qwen2_5OmniVisionEncoderConfig
1149
1078
  _no_split_modules = ["Qwen2_5OmniVisionBlock"]
1150
1079
  _input_embed_layer = "patch_embed"
1151
- _can_record_outputs = {
1152
- "hidden_states": Qwen2_5OmniVisionBlock,
1153
- "attentions": Qwen2_5OmniVisionAttention,
1154
- }
1155
1080
  input_modalities = ("image", "video")
1156
1081
 
1157
1082
  def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) -> None:
@@ -1179,8 +1104,6 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
1179
1104
  )
1180
1105
  self.gradient_checkpointing = False
1181
1106
 
1182
- self.post_init()
1183
-
1184
1107
  def rot_pos_emb(self, grid_thw):
1185
1108
  pos_ids = []
1186
1109
  for t, h, w in grid_thw:
@@ -1251,10 +1174,7 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
1251
1174
 
1252
1175
  return window_index, cu_window_seqlens
1253
1176
 
1254
- @check_model_inputs
1255
- def forward(
1256
- self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
1257
- ) -> tuple | BaseModelOutputWithPooling:
1177
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
1258
1178
  """
1259
1179
  Args:
1260
1180
  hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1307,15 +1227,11 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
1307
1227
  rotary_pos_emb=rotary_pos_emb,
1308
1228
  **kwargs,
1309
1229
  )
1310
-
1311
- merged_hidden_states = self.merger(hidden_states)
1230
+ hidden_states = self.merger(hidden_states)
1312
1231
  reverse_indices = torch.argsort(window_index)
1313
- merged_hidden_states = merged_hidden_states[reverse_indices, :]
1232
+ hidden_states = hidden_states[reverse_indices, :]
1314
1233
 
1315
- return BaseModelOutputWithPooling(
1316
- last_hidden_state=hidden_states,
1317
- pooler_output=merged_hidden_states,
1318
- )
1234
+ return hidden_states
1319
1235
 
1320
1236
 
1321
1237
  class Qwen2_5OmniRotaryEmbedding(nn.Module):
@@ -1335,13 +1251,13 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
1335
1251
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
1336
1252
 
1337
1253
  self.register_buffer("inv_freq", inv_freq, persistent=False)
1338
- self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
1254
+ self.original_inv_freq = inv_freq
1339
1255
 
1340
1256
  @staticmethod
1341
1257
  def compute_default_rope_parameters(
1342
- config: Qwen2_5OmniConfig | None = None,
1258
+ config: Optional[Qwen2_5OmniConfig] = None,
1343
1259
  device: Optional["torch.device"] = None,
1344
- seq_len: int | None = None,
1260
+ seq_len: Optional[int] = None,
1345
1261
  ) -> tuple["torch.Tensor", float]:
1346
1262
  """
1347
1263
  Computes the inverse frequencies according to the original RoPE implementation
@@ -1375,7 +1291,7 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
1375
1291
  position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
1376
1292
 
1377
1293
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
1378
- with maybe_autocast(device_type=device_type, enabled=False): # Force float32
1294
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
1379
1295
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
1380
1296
  emb = torch.cat((freqs, freqs), dim=-1)
1381
1297
  cos = emb.cos() * self.attention_scaling
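
A 1-D sketch of this cos/sin computation; the model's version runs over three mRoPE axes, and an `attention_scaling` of 1.0 is assumed here:

    import torch

    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 64, 2).float() / 64))  # (head_dim/2,)
    position_ids = torch.arange(12).float()[None, :]                     # (bs=1, positions)
    with torch.autocast(device_type="cpu", enabled=False):               # force float32
        freqs = (inv_freq[None, :, None] @ position_ids[:, None, :]).transpose(1, 2)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos() * 1.0  # attention_scaling assumed 1.0
        sin = emb.sin() * 1.0
    assert cos.shape == (1, 12, 64)
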
@@ -1435,7 +1351,7 @@ class Qwen2_5OmniAttention(nn.Module):
1435
1351
  and "Generating Long Sequences with Sparse Transformers".
1436
1352
  """
1437
1353
 
1438
- def __init__(self, config: Qwen2_5OmniConfig, layer_idx: int | None = None):
1354
+ def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None):
1439
1355
  super().__init__()
1440
1356
  self.config = config
1441
1357
  self.layer_idx = layer_idx
@@ -1466,15 +1382,15 @@ class Qwen2_5OmniAttention(nn.Module):
1466
1382
  def forward(
1467
1383
  self,
1468
1384
  hidden_states: torch.Tensor,
1469
- attention_mask: torch.Tensor | None = None,
1470
- position_ids: torch.LongTensor | None = None,
1471
- past_key_values: Cache | None = None,
1385
+ attention_mask: Optional[torch.Tensor] = None,
1386
+ position_ids: Optional[torch.LongTensor] = None,
1387
+ past_key_values: Optional[Cache] = None,
1472
1388
  output_attentions: bool = False,
1473
1389
  use_cache: bool = False,
1474
- cache_position: torch.LongTensor | None = None,
1475
- position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
1390
+ cache_position: Optional[torch.LongTensor] = None,
1391
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
1476
1392
  **kwargs: Unpack[FlashAttentionKwargs],
1477
- ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
1393
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
1478
1394
  bsz, q_len, _ = hidden_states.size()
1479
1395
 
1480
1396
  query_states = self.q_proj(hidden_states)
@@ -1535,7 +1451,7 @@ class Qwen2_5OmniDecoderLayer(GradientCheckpointingLayer):
1535
1451
  super().__init__()
1536
1452
  self.hidden_size = config.hidden_size
1537
1453
 
1538
- if config.use_sliding_window and not is_flash_attention_requested(config):
1454
+ if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
1539
1455
  logger.warning_once(
1540
1456
  f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
1541
1457
  "unexpected results may be encountered."
@@ -1550,15 +1466,15 @@ class Qwen2_5OmniDecoderLayer(GradientCheckpointingLayer):
1550
1466
  def forward(
1551
1467
  self,
1552
1468
  hidden_states: torch.Tensor,
1553
- attention_mask: torch.Tensor | None = None,
1554
- position_ids: torch.LongTensor | None = None,
1555
- past_key_values: Cache | None = None,
1556
- output_attentions: bool | None = False,
1557
- use_cache: bool | None = False,
1558
- cache_position: torch.LongTensor | None = None,
1559
- position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
1469
+ attention_mask: Optional[torch.Tensor] = None,
1470
+ position_ids: Optional[torch.LongTensor] = None,
1471
+ past_key_values: Optional[Cache] = None,
1472
+ output_attentions: Optional[bool] = False,
1473
+ use_cache: Optional[bool] = False,
1474
+ cache_position: Optional[torch.LongTensor] = None,
1475
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
1560
1476
  **kwargs: Unpack[FlashAttentionKwargs],
1561
- ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
1477
+ ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
1562
1478
  """
1563
1479
  Args:
1564
1480
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -1640,18 +1556,18 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel):
1640
1556
  @auto_docstring
1641
1557
  def forward(
1642
1558
  self,
1643
- input_ids: torch.LongTensor | None = None,
1644
- attention_mask: torch.Tensor | None = None,
1645
- position_ids: torch.LongTensor | None = None,
1646
- past_key_values: Cache | None = None,
1647
- inputs_embeds: torch.FloatTensor | None = None,
1648
- use_cache: bool | None = None,
1649
- output_attentions: bool | None = None,
1650
- output_hidden_states: bool | None = None,
1651
- return_dict: bool | None = None,
1652
- cache_position: torch.LongTensor | None = None,
1559
+ input_ids: Optional[torch.LongTensor] = None,
1560
+ attention_mask: Optional[torch.Tensor] = None,
1561
+ position_ids: Optional[torch.LongTensor] = None,
1562
+ past_key_values: Optional[Cache] = None,
1563
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1564
+ use_cache: Optional[bool] = None,
1565
+ output_attentions: Optional[bool] = None,
1566
+ output_hidden_states: Optional[bool] = None,
1567
+ return_dict: Optional[bool] = None,
1568
+ cache_position: Optional[torch.LongTensor] = None,
1653
1569
  **kwargs: Unpack[FlashAttentionKwargs],
1654
- ) -> tuple | BaseModelOutputWithPast:
1570
+ ) -> Union[tuple, BaseModelOutputWithPast]:
1655
1571
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1656
1572
  output_hidden_states = (
1657
1573
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1789,6 +1705,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
1789
1705
  self.vocab_size = config.text_config.vocab_size
1790
1706
  self.model = Qwen2_5OmniThinkerTextModel._from_config(config.text_config)
1791
1707
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
1708
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
1792
1709
  self.spatial_merge_size = config.vision_config.spatial_merge_size
1793
1710
  self.rope_deltas = None
1794
1711
  self.post_init()
@@ -1799,56 +1716,52 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
1799
1716
  def set_input_embeddings(self, value):
1800
1717
  self.model.set_input_embeddings(value)
1801
1718
 
1802
- @can_return_tuple
1803
- @auto_docstring
1804
1719
  def get_video_features(
1805
- self,
1806
- pixel_values_videos: torch.FloatTensor,
1807
- video_grid_thw: torch.LongTensor | None = None,
1808
- **kwargs: Unpack[TransformersKwargs],
1809
- ) -> tuple | BaseModelOutputWithPooling:
1810
- r"""
1811
- pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1812
- The tensors corresponding to the input videos.
1813
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1814
- The temporal, height and width of feature shape of each video in LLM.
1720
+ self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
1721
+ ):
1722
+ """
1723
+ Encodes videos into continuous embeddings that can be forwarded to the language model.
1724
+
1725
+ Args:
1726
+ pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1727
+ The tensors corresponding to the input videos.
1728
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1729
+ The temporal, height and width of feature shape of each video in LLM.
1815
1730
  """
1816
1731
  pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
1817
- return self.visual(pixel_values_videos, grid_thw=video_grid_thw, **kwargs)
1732
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
1733
+ return video_embeds
1818
1734
 
1819
- @can_return_tuple
1820
- @auto_docstring
1821
- def get_image_features(
1822
- self,
1823
- pixel_values: torch.FloatTensor,
1824
- image_grid_thw: torch.LongTensor | None = None,
1825
- **kwargs: Unpack[TransformersKwargs],
1826
- ) -> tuple | BaseModelOutputWithPooling:
1827
- r"""
1828
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1829
- The tensors corresponding to the input images.
1830
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1831
- The temporal, height and width of feature shape of each image in LLM.
1735
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1736
+ """
1737
+ Encodes images into continuous embeddings that can be forwarded to the language model.
1738
+
1739
+ Args:
1740
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1741
+ The tensors corresponding to the input images.
1742
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1743
+ The temporal, height and width of feature shape of each image in LLM.
1832
1744
  """
1833
1745
  pixel_values = pixel_values.type(self.visual.dtype)
1834
- return self.visual(pixel_values, grid_thw=image_grid_thw, **kwargs)
1746
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1747
+ return image_embeds
1835
1748
 
1836
- @can_return_tuple
1837
- @auto_docstring
1838
1749
  def get_audio_features(
1839
1750
  self,
1840
1751
  input_features: torch.FloatTensor,
1841
- feature_attention_mask: torch.LongTensor | None = None,
1842
- audio_feature_lengths: torch.LongTensor | None = None,
1843
- **kwargs: Unpack[TransformersKwargs],
1844
- ) -> tuple | BaseModelOutputWithPooling:
1845
- r"""
1846
- input_features (`torch.FloatTensor`):
1847
- The tensors corresponding to the input audios.
1848
- feature_attention_mask (`torch.LongTensor`, *optional*):
1849
- Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1850
- audio_feature_lengths (`torch.LongTensor` of shape `(num_audios)`, *optional*):
1851
- The length of feature shape of each audio in LLM.
1752
+ feature_attention_mask: Optional[torch.LongTensor] = None,
1753
+ audio_feature_lengths: Optional[torch.LongTensor] = None,
1754
+ ):
1755
+ """
1756
+ Encodes audios into continuous embeddings that can be forwarded to the language model.
1757
+
1758
+ Args:
1759
+ input_features (`torch.FloatTensor`):
1760
+ The tensors corresponding to the input audios.
1761
+ feature_attention_mask (`torch.LongTensor`, *optional*):
1762
+ Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1763
+ audio_feature_lengths (`torch.LongTensor` of shape `(num_audios)`, *optional*):
1764
+ The length of feature shape of each audio in LLM.
1852
1765
  """
1853
1766
  if feature_attention_mask is not None:
1854
1767
  audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
@@ -1864,20 +1777,20 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
1864
1777
  input_features,
1865
1778
  feature_lens=feature_lens,
1866
1779
  aftercnn_lens=audio_feat_lengths,
1867
- return_dict=True,
1868
- **kwargs,
1869
1780
  )
1870
- if audio_outputs.last_hidden_state.shape[0] != sum(audio_output_lengths.tolist()):
1781
+ audio_features = audio_outputs.last_hidden_state
1782
+
1783
+ if audio_features.shape[0] != sum(audio_output_lengths.tolist()):
1871
1784
  raise ValueError("length of audio_features should match audio_output_lengths")
1872
1785
 
1873
- return audio_outputs
1786
+ return audio_features
1874
1787
 
1875
1788
  def get_placeholder_mask(
1876
1789
  self,
1877
1790
  input_ids: torch.LongTensor,
1878
1791
  inputs_embeds: torch.FloatTensor,
1879
- image_features: torch.FloatTensor | None = None,
1880
- video_features: torch.FloatTensor | None = None,
1792
+ image_features: Optional[torch.FloatTensor] = None,
1793
+ video_features: Optional[torch.FloatTensor] = None,
1881
1794
  ):
1882
1795
  """
1883
1796
  Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
@@ -1905,18 +1818,16 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
1905
1818
 
1906
1819
  n_image_tokens = special_image_mask.sum()
1907
1820
  special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1908
- if image_features is not None:
1909
- torch_compilable_check(
1910
- inputs_embeds[special_image_mask].numel() == image_features.numel(),
1911
- f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}",
1821
+ if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
1822
+ raise ValueError(
1823
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
1912
1824
  )
1913
1825
 
1914
1826
  n_video_tokens = special_video_mask.sum()
1915
1827
  special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1916
- if video_features is not None:
1917
- torch_compilable_check(
1918
- inputs_embeds[special_video_mask].numel() == video_features.numel(),
1919
- f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}",
1828
+ if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
1829
+ raise ValueError(
1830
+ f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
1920
1831
  )
1921
1832
 
1922
1833
  special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
@@ -1925,29 +1836,29 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
1925
1836
  @auto_docstring
1926
1837
  def forward(
1927
1838
  self,
1928
- input_ids: torch.LongTensor | None = None,
1929
- input_features: torch.FloatTensor | None = None,
1930
- pixel_values: torch.FloatTensor | None = None,
1931
- pixel_values_videos: torch.FloatTensor | None = None,
1932
- image_grid_thw: torch.LongTensor | None = None,
1933
- video_grid_thw: torch.LongTensor | None = None,
1934
- attention_mask: torch.Tensor | None = None,
1935
- feature_attention_mask: torch.Tensor | None = None,
1936
- audio_feature_lengths: torch.LongTensor | None = None,
1937
- position_ids: torch.LongTensor | None = None,
1938
- past_key_values: Cache | None = None,
1939
- inputs_embeds: torch.FloatTensor | None = None,
1940
- rope_deltas: torch.LongTensor | None = None,
1941
- labels: torch.LongTensor | None = None,
1942
- use_cache: bool | None = None,
1943
- output_attentions: bool | None = None,
1944
- output_hidden_states: bool | None = None,
1945
- return_dict: bool | None = None,
1946
- use_audio_in_video: bool | None = None,
1947
- cache_position: torch.LongTensor | None = None,
1948
- video_second_per_grid: torch.LongTensor | None = None,
1839
+ input_ids: Optional[torch.LongTensor] = None,
1840
+ input_features: Optional[torch.FloatTensor] = None,
1841
+ pixel_values: Optional[torch.FloatTensor] = None,
1842
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1843
+ image_grid_thw: Optional[torch.LongTensor] = None,
1844
+ video_grid_thw: Optional[torch.LongTensor] = None,
1845
+ attention_mask: Optional[torch.Tensor] = None,
1846
+ feature_attention_mask: Optional[torch.Tensor] = None,
1847
+ audio_feature_lengths: Optional[torch.LongTensor] = None,
1848
+ position_ids: Optional[torch.LongTensor] = None,
1849
+ past_key_values: Optional[Cache] = None,
1850
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1851
+ rope_deltas: Optional[torch.LongTensor] = None,
1852
+ labels: Optional[torch.LongTensor] = None,
1853
+ use_cache: Optional[bool] = None,
1854
+ output_attentions: Optional[bool] = None,
1855
+ output_hidden_states: Optional[bool] = None,
1856
+ return_dict: Optional[bool] = None,
1857
+ use_audio_in_video: Optional[bool] = None,
1858
+ cache_position: Optional[torch.LongTensor] = None,
1859
+ video_second_per_grid: Optional[torch.LongTensor] = None,
1949
1860
  **kwargs: Unpack[TransformersKwargs],
1950
- ) -> tuple | Qwen2_5OmniThinkerCausalLMOutputWithPast:
1861
+ ) -> Union[tuple, Qwen2_5OmniThinkerCausalLMOutputWithPast]:
1951
1862
  r"""
1952
1863
  image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1953
1864
  The temporal, height and width of feature shape of each image in LLM.
@@ -2020,14 +1931,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2020
1931
  input_features,
2021
1932
  feature_attention_mask=feature_attention_mask,
2022
1933
  audio_feature_lengths=audio_feature_lengths,
2023
- return_dict=True,
2024
- ).last_hidden_state
1934
+ )
2025
1935
  audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
2026
1936
  _, _, audio_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
2027
1937
  inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_features)
2028
1938
 
2029
1939
  if pixel_values is not None:
2030
- image_embeds = self.get_image_features(pixel_values, image_grid_thw, return_dict=True).pooler_output
1940
+ image_embeds = self.get_image_features(pixel_values, image_grid_thw)
2031
1941
  image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2032
1942
  image_mask, _, _ = self.get_placeholder_mask(
2033
1943
  input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
@@ -2035,7 +1945,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2035
1945
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
2036
1946
 
2037
1947
  if pixel_values_videos is not None:
2038
- video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw, return_dict=True).pooler_output
1948
+ video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
2039
1949
  video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2040
1950
  _, video_mask, _ = self.get_placeholder_mask(
2041
1951
  input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
@@ -2048,8 +1958,11 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2048
1958
  audio_feature_lengths = None
2049
1959
 
2050
1960
  if attention_mask is not None and position_ids is None:
2051
- past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
2052
- if past_key_values_length == 0 or self.rope_deltas is None:
1961
+ if (
1962
+ cache_position is None
1963
+ or (cache_position is not None and cache_position[0] == 0)
1964
+ or self.rope_deltas is None
1965
+ ):
2053
1966
  delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
2054
1967
  position_ids, rope_deltas = self.get_rope_index(
2055
1968
  input_ids,
@@ -2064,7 +1977,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2064
1977
  self.rope_deltas = rope_deltas
2065
1978
  else:
2066
1979
  batch_size, seq_length = input_ids.shape
2067
- delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
1980
+ delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
2068
1981
  position_ids = torch.arange(seq_length, device=input_ids.device)
2069
1982
  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
2070
1983
  position_ids = position_ids.add(delta)
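
A small illustration of this cached-decoding branch with made-up cache offsets and rope deltas: each new token's position is its cache offset plus the stored delta.

    import torch

    batch_size, seq_length = 2, 1           # decoding one token per step
    cache_position = torch.tensor([17])     # 17 tokens already in the KV cache
    rope_deltas = torch.tensor([[3], [5]])  # per-sample delta from get_rope_index
    delta = cache_position[0] + rope_deltas
    position_ids = torch.arange(seq_length).view(1, -1).expand(batch_size, -1)
    position_ids = position_ids.add(delta)
    print(position_ids)  # tensor([[20], [22]])
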
@@ -2122,7 +2035,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2122
2035
  feature_attention_mask=None,
2123
2036
  use_audio_in_video=False,
2124
2037
  video_second_per_grid=None,
2125
- is_first_iteration=False,
2126
2038
  **kwargs,
2127
2039
  ):
2128
2040
  model_inputs = super().prepare_inputs_for_generation(
@@ -2141,13 +2053,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
2141
2053
  feature_attention_mask=feature_attention_mask,
2142
2054
  use_audio_in_video=use_audio_in_video,
2143
2055
  video_second_per_grid=video_second_per_grid,
2144
- is_first_iteration=is_first_iteration,
2145
2056
  **kwargs,
2146
2057
  )
2147
2058
 
2148
2059
  model_inputs["position_ids"] = None
2149
2060
 
2150
- if not is_first_iteration and use_cache:
2061
+ if cache_position[0] != 0:
2151
2062
  model_inputs["pixel_values"] = None
2152
2063
  model_inputs["pixel_values_videos"] = None
2153
2064
  model_inputs["input_features"] = None
@@ -2184,13 +2095,13 @@ class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput):
2184
2095
  response that the talker model will use to generate speech tokens.
2185
2096
  """
2186
2097
 
2187
- loss: torch.FloatTensor | None = None
2188
- logits: torch.FloatTensor | None = None
2189
- past_key_values: Cache | None = None
2190
- hidden_states: tuple[torch.FloatTensor] | None = None
2191
- attentions: tuple[torch.FloatTensor] | None = None
2192
- rope_deltas: torch.LongTensor | None = None
2193
- thinker_reply_part: torch.FloatTensor | None = None
2098
+ loss: Optional[torch.FloatTensor] = None
2099
+ logits: Optional[torch.FloatTensor] = None
2100
+ past_key_values: Optional[Cache] = None
2101
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
2102
+ attentions: Optional[tuple[torch.FloatTensor]] = None
2103
+ rope_deltas: Optional[torch.LongTensor] = None
2104
+ thinker_reply_part: Optional[torch.FloatTensor] = None
2194
2105
 
2195
2106
 
2196
2107
  @auto_docstring
@@ -2220,18 +2131,18 @@ class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel):
2220
2131
  @auto_docstring
2221
2132
  def forward(
2222
2133
  self,
2223
- input_ids: torch.LongTensor | None = None,
2224
- attention_mask: torch.Tensor | None = None,
2225
- position_ids: torch.LongTensor | None = None,
2226
- past_key_values: Cache | None = None,
2227
- inputs_embeds: torch.FloatTensor | None = None,
2228
- use_cache: bool | None = None,
2229
- output_attentions: bool | None = None,
2230
- output_hidden_states: bool | None = None,
2231
- return_dict: bool | None = None,
2232
- cache_position: torch.LongTensor | None = None,
2134
+ input_ids: Optional[torch.LongTensor] = None,
2135
+ attention_mask: Optional[torch.Tensor] = None,
2136
+ position_ids: Optional[torch.LongTensor] = None,
2137
+ past_key_values: Optional[Cache] = None,
2138
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2139
+ use_cache: Optional[bool] = None,
2140
+ output_attentions: Optional[bool] = None,
2141
+ output_hidden_states: Optional[bool] = None,
2142
+ return_dict: Optional[bool] = None,
2143
+ cache_position: Optional[torch.LongTensor] = None,
2233
2144
  **kwargs: Unpack[FlashAttentionKwargs],
2234
- ) -> tuple | BaseModelOutputWithPast:
2145
+ ) -> Union[tuple, BaseModelOutputWithPast]:
2235
2146
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2236
2147
  output_hidden_states = (
2237
2148
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -2388,26 +2299,25 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
2388
2299
  @auto_docstring
2389
2300
  def forward(
2390
2301
  self,
2391
- input_ids: torch.LongTensor | None = None,
2392
- attention_mask: torch.Tensor | None = None,
2393
- position_ids: torch.LongTensor | None = None,
2394
- past_key_values: Cache | None = None,
2395
- thinker_reply_part: torch.FloatTensor | None = None,
2396
- inputs_embeds: torch.FloatTensor | None = None,
2397
- rope_deltas: torch.LongTensor | None = None,
2398
- use_cache: bool | None = None,
2399
- cache_position: torch.LongTensor | None = None,
2400
- input_text_ids: torch.LongTensor | None = None,
2401
- image_grid_thw: torch.LongTensor | None = None,
2402
- video_grid_thw: torch.LongTensor | None = None,
2403
- use_audio_in_video: bool | None = None,
2404
- audio_feature_lengths: torch.LongTensor | None = None,
2405
- video_second_per_grid: torch.LongTensor | None = None,
2406
- output_attentions: bool | None = None,
2407
- output_hidden_states: bool | None = None,
2408
- return_dict: bool | None = None,
2409
- **kwargs,
2410
- ) -> tuple | Qwen2_5OmniTalkerCausalLMOutputWithPast:
2302
+ input_ids: Optional[torch.LongTensor] = None,
2303
+ attention_mask: Optional[torch.Tensor] = None,
2304
+ position_ids: Optional[torch.LongTensor] = None,
2305
+ past_key_values: Optional[Cache] = None,
2306
+ thinker_reply_part: Optional[torch.FloatTensor] = None,
2307
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2308
+ rope_deltas: Optional[torch.LongTensor] = None,
2309
+ use_cache: Optional[bool] = None,
2310
+ cache_position: Optional[torch.LongTensor] = None,
2311
+ input_text_ids: Optional[torch.LongTensor] = None,
2312
+ image_grid_thw: Optional[torch.LongTensor] = None,
2313
+ video_grid_thw: Optional[torch.LongTensor] = None,
2314
+ use_audio_in_video: Optional[bool] = None,
2315
+ audio_feature_lengths: Optional[torch.LongTensor] = None,
2316
+ video_second_per_grid: Optional[torch.LongTensor] = None,
2317
+ output_attentions: Optional[bool] = None,
2318
+ output_hidden_states: Optional[bool] = None,
2319
+ return_dict: Optional[bool] = None,
2320
+ ) -> Union[tuple, Qwen2_5OmniTalkerCausalLMOutputWithPast]:
2411
2321
  r"""
2412
2322
  thinker_reply_part (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
2413
2323
  Hidden states from the thinker model's output that represent the text reply part to be processed.
@@ -2456,8 +2366,11 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
2456
2366
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2457
2367
 
2458
2368
  if attention_mask is not None and position_ids is None:
2459
- past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
2460
- if past_key_values_length == 0 or self.rope_deltas is None:
2369
+ if (
2370
+ cache_position is None
2371
+ or (cache_position is not None and cache_position[0] == 0)
2372
+ or self.rope_deltas is None
2373
+ ):
2461
2374
  position_ids, rope_deltas = self.get_rope_index(
2462
2375
  input_text_ids,
2463
2376
  image_grid_thw,
@@ -2477,12 +2390,8 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
2477
2390
  self.rope_deltas = rope_deltas
2478
2391
 
2479
2392
  else:
2480
- if inputs_embeds is not None:
2481
- batch_size, seq_length, _ = inputs_embeds.shape
2482
- else:
2483
- batch_size, seq_length = input_ids.shape
2484
-
2485
- delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
2393
+ batch_size, seq_length = input_ids.shape
2394
+ delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
2486
2395
  position_ids = torch.arange(seq_length, device=input_ids.device)
2487
2396
  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
2488
2397
  position_ids = position_ids.add(delta)
@@ -2616,13 +2525,13 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
2616
2525
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
2617
2526
 
2618
2527
  self.register_buffer("inv_freq", inv_freq, persistent=False)
2619
- self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
2528
+ self.original_inv_freq = inv_freq
2620
2529
 
2621
2530
  @staticmethod
2622
2531
  def compute_default_rope_parameters(
2623
- config: Qwen2_5OmniDiTConfig | None = None,
2532
+ config: Optional[Qwen2_5OmniDiTConfig] = None,
2624
2533
  device: Optional["torch.device"] = None,
2625
- seq_len: int | None = None,
2534
+ seq_len: Optional[int] = None,
2626
2535
  ) -> tuple["torch.Tensor", float]:
2627
2536
  """
2628
2537
  Computes the inverse frequencies according to the original RoPE implementation
@@ -2655,7 +2564,7 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
2655
2564
  position_ids_expanded = position_ids[:, None, :].float()
2656
2565
 
2657
2566
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
2658
- with maybe_autocast(device_type=device_type, enabled=False): # Force float32
2567
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
2659
2568
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
2660
2569
  emb = torch.cat((freqs, freqs), dim=-1)
2661
2570
  cos = emb.cos() * self.attention_scaling
@@ -2981,9 +2890,9 @@ class DiTInputEmbedding(nn.Module):
2981
2890
  speaker_embedding: torch.Tensor,
2982
2891
  condition_vector: torch.Tensor,
2983
2892
  code_embed: torch.Tensor,
2984
- drop_audio_cond: bool | None = False,
2985
- code_embed_uncond: bool | None = None,
2986
- apply_cfg: bool | None = True,
2893
+ drop_audio_cond: Optional[bool] = False,
2894
+ code_embed_uncond: Optional[bool] = None,
2895
+ apply_cfg: Optional[bool] = True,
2987
2896
  ):
2988
2897
  if apply_cfg:
2989
2898
  hidden_states = torch.cat([hidden_states, hidden_states], dim=0)
@@ -3075,7 +2984,7 @@ class DiTMLP(nn.Module):
3075
2984
 
3076
2985
 
3077
2986
  # Modified from Llama with a different rotate function, will be fixed in the next release
3078
- def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
2987
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
3079
2988
  """Applies Rotary Position Embedding to the query and key tensors.
3080
2989
 
3081
2990
  Args:
@@ -3083,6 +2992,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
3083
2992
  k (`torch.Tensor`): The key tensor.
3084
2993
  cos (`torch.Tensor`): The cosine part of the rotary embedding.
3085
2994
  sin (`torch.Tensor`): The sine part of the rotary embedding.
2995
+ position_ids (`torch.Tensor`, *optional*):
2996
+ Deprecated and unused.
3086
2997
  unsqueeze_dim (`int`, *optional*, defaults to 1):
3087
2998
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
3088
2999
  sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
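
A sketch of the application step with `unsqueeze_dim=1`; since the comment above says this model's rotate function differs from Llama's, the rotate-half variant used here is illustrative only:

    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    q = torch.randn(1, 4, 6, 8)                    # (batch, heads, seq, head_dim)
    k = torch.randn(1, 4, 6, 8)
    cos = torch.randn(1, 6, 8)                     # (batch, seq, head_dim)
    sin = torch.randn(1, 6, 8)
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)  # unsqueeze_dim=1 broadcasts over heads
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    assert q_embed.shape == q.shape and k_embed.shape == k.shape
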
@@ -3277,6 +3188,52 @@ class SnakeBeta(nn.Module):
3277
3188
  return hidden_states
3278
3189
 
3279
3190
 
3191
+ def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
3192
+ """Generates a 1D Kaiser-windowed sinc filter.
3193
+
3194
+ Args:
3195
+ cutoff (float): Normalized cutoff frequency (0 to 0.5).
3196
+ half_width (float): Transition bandwidth.
3197
+ kernel_size (int): Number of filter taps.
3198
+
3199
+ Returns:
3200
+ torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
3201
+ """
3202
+ is_even = kernel_size % 2 == 0
3203
+ half_size = kernel_size // 2
3204
+
3205
+ # Compute Kaiser window parameters
3206
+ delta_f = 4 * half_width
3207
+ attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
3208
+
3209
+ if attenuation > 50.0:
3210
+ beta = 0.1102 * (attenuation - 8.7)
3211
+ elif attenuation >= 21.0:
3212
+ beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
3213
+ else:
3214
+ beta = 0.0
3215
+
3216
+ kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
3217
+
3218
+ # Compute time indices
3219
+ if is_even:
3220
+ time_indices = torch.arange(-half_size, half_size) + 0.5
3221
+ else:
3222
+ time_indices = torch.arange(kernel_size) - half_size
3223
+
3224
+ # Compute sinc filter
3225
+ if cutoff == 0:
3226
+ return torch.zeros((1, 1, kernel_size), dtype=torch.float32) # Ensures correct shape
3227
+
3228
+ sinc_filter = torch.sinc(2 * cutoff * time_indices)
3229
+ normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
3230
+
3231
+ # Normalize to ensure sum = 1 (avoid leakage of constant component)
3232
+ normalized_filter /= normalized_filter.sum()
3233
+
3234
+ return normalized_filter.view(1, 1, kernel_size)
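
A usage sketch for the filter defined above, applied as an anti-aliasing low-pass before naive decimation; the padding and stride choices here are illustrative, not taken from the `UpSample1d`/`DownSample1d` modules:

    import torch
    import torch.nn.functional as F

    ratio, kernel_size = 2, 12
    filt = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=kernel_size)
    audio = torch.randn(1, 1, 128)                  # (batch, channels, time)
    lowpassed = F.conv1d(audio, filt, padding=kernel_size // 2)
    decimated = lowpassed[..., ::ratio]             # naive decimation after low-pass
    print(decimated.shape)                          # torch.Size([1, 1, 65])
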
3235
+
3236
+
3280
3237
  class UpSample1d(nn.Module):
3281
3238
  def __init__(self, ratio=2, kernel_size=None):
3282
3239
  super().__init__()
@@ -3307,9 +3264,6 @@ class DownSample1d(nn.Module):
3307
3264
  super().__init__()
3308
3265
  cutoff = 0.5 / ratio
3309
3266
  half_width = 0.6 / ratio
3310
- self.cutoff = cutoff
3311
- self.half_width = half_width
3312
- self.kernel_size = kernel_size
3313
3267
 
3314
3268
  if cutoff < 0.0:
3315
3269
  raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3491,8 +3445,6 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
3491
3445
  config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
3492
3446
  )
3493
3447
 
3494
- self.post_init()
3495
-
3496
3448
  def normalize_spectrogram(self, spectrogram, max_value, min_db):
3497
3449
  return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)
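
A quick numeric check of this affine map: `min_db` lands at `-max_value`, 0 dB at `+max_value`, and anything outside is clamped.

    import torch

    spec = torch.tensor([-115.0, -57.5, 0.0, 10.0])
    max_value, min_db = 1.0, -115.0
    out = torch.clamp((2 * max_value) * ((spec - min_db) / (-min_db)) - max_value, -max_value, max_value)
    print(out)  # tensor([-1.,  0.,  1.,  1.])
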
3498
3450
 
@@ -3507,7 +3459,7 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
3507
3459
  decibel_spectrum = self.amplitude_to_db(amplitude_spectrum, -115) - 20
3508
3460
  return self.normalize_spectrogram(decibel_spectrum, 1, -115)
3509
3461
 
3510
- def forward(self, mel_spectrogram, **kwargs):
3462
+ def forward(self, mel_spectrogram):
3511
3463
  processed_spectrogram = self.process_mel_spectrogram(mel_spectrogram)
3512
3464
  hidden_representation = self.conv_pre(processed_spectrogram)
3513
3465
 
@@ -3620,8 +3572,6 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
3620
3572
  self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size) # final modulation
3621
3573
  self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
3622
3574
 
3623
- self.post_init()
3624
-
3625
3575
  def _create_block_diff(self, hidden_states):
3626
3576
  batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
3627
3577
  block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size # [seq_length]
@@ -3642,7 +3592,6 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
3642
3592
  drop_audio_conditioning=False,
3643
3593
  drop_code=False,
3644
3594
  apply_cfg=True,
3645
- **kwargs,
3646
3595
  ):
3647
3596
  batch_size = hidden_states.shape[0]
3648
3597
  if time_step.ndim == 0:
@@ -3693,24 +3642,15 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
3693
3642
  guidance_scale=0.5,
3694
3643
  sway_coefficient=-1.0,
3695
3644
  ):
3645
+ noise_initialization = torch.randn([1, 30000, self.mel_dim], dtype=reference_mel_spectrogram.dtype)
3696
3646
  maximum_duration = quantized_code.shape[1] * self.repeats
3647
+ initial_state = noise_initialization[:, :maximum_duration].to(quantized_code.device)
3697
3648
  batch_size = reference_mel_spectrogram.shape[0]
3649
+ conditioning_vector = conditioning_vector.unsqueeze(1).repeat(1, maximum_duration, 1)
3650
+
3698
3651
  if batch_size != 1:
3699
3652
  raise ValueError("Only batch size = 1 is currently supported")
3700
3653
 
3701
- if maximum_duration > self.config.max_position_embeddings:
3702
- raise ValueError(
3703
- f"Requested mel length ({maximum_duration}) exceeds `dit_config.max_position_embeddings` "
3704
- f"({self.config.max_position_embeddings}). Provide shorter `quantized_code`."
3705
- )
3706
-
3707
- initial_state = torch.randn(
3708
- [batch_size, maximum_duration, self.mel_dim],
3709
- dtype=reference_mel_spectrogram.dtype,
3710
- device=quantized_code.device,
3711
- )
3712
- conditioning_vector = conditioning_vector.unsqueeze(1).repeat(1, maximum_duration, 1)
3713
-
3714
3654
  def ode_function(time_step, hidden_states):
3715
3655
  if guidance_scale < 1e-5:
3716
3656
  prediction = self(
@@ -3721,7 +3661,6 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
3721
3661
  time_step=time_step,
3722
3662
  drop_audio_conditioning=False,
3723
3663
  drop_code=False,
3724
- apply_cfg=False,
3725
3664
  )
3726
3665
  return prediction
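
The guidance branch not visible in this hunk presumably blends the conditional and unconditional predictions; a sketch of standard classifier-free guidance, assumed rather than copied from this file:

    import torch

    def cfg_blend(pred_cond, pred_uncond, guidance_scale):
        if guidance_scale < 1e-5:  # matches the early-exit branch above
            return pred_cond
        return pred_uncond + guidance_scale * (pred_cond - pred_uncond)

    cond, uncond = torch.randn(1, 100, 80), torch.randn(1, 100, 80)  # (batch, frames, mel_dim)
    blended = cfg_blend(cond, uncond, guidance_scale=0.5)
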
3727
3666
 
@@ -3766,9 +3705,9 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
3766
3705
  def __init__(self, config: Qwen2_5OmniToken2WavConfig):
3767
3706
  super().__init__(config)
3768
3707
  attn_impl = config._attn_implementation
3769
- if is_flash_attention_requested(config):
3708
+ if config._attn_implementation == "flash_attention_2":
3770
3709
  logger.warning_once(
3771
- "Qwen2_5OmniToken2WavModel must inference with fp32, but Flash Attention only supports fp16 and bf16, "
3710
+ "Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, "
3772
3711
  "attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa."
3773
3712
  )
3774
3713
  attn_impl = "sdpa"
@@ -3784,8 +3723,6 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
3784
3723
  config.bigvgan_config, attn_implementation=attn_impl
3785
3724
  )
3786
3725
 
3787
- self.post_init()
3788
-
3789
3726
  def forward(
3790
3727
  self,
3791
3728
  code,
@@ -3918,7 +3855,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
3918
3855
  # TODO: raushan, defaults should be saved in generation config
3919
3856
  def generate(
3920
3857
  self,
3921
- input_ids: torch.Tensor | None = None,
3858
+ input_ids: Optional[torch.Tensor] = None,
3922
3859
  speaker: str = "Chelsie",
3923
3860
  use_audio_in_video: bool = False,
3924
3861
  thinker_max_new_tokens: int = 1024,
@@ -3954,7 +3891,7 @@ class Qwen2_5OmniForConditionalGeneration(Qwen2_5OmniPreTrainedModel, Generation
3954
3891
  - **Text** (`torch.Tensor`): Generated text token sequence.
3955
3892
  - **Audio waveform** (`torch.Tensor`): Generated audio waveform.
3956
3893
  """
3957
- # check `False` on purpose because the parameter can be `str/bool`. This is needed for BC
3894
+ # check `False` on purpose because the paramter can be `str/bool`. This is needed for BC
3958
3895
  generation_mode = kwargs.pop("generation_mode", None)
3959
3896
  return_audio = generation_mode != "text" and generation_mode is not False
3960
3897