transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -209,8 +209,6 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
209
209
  use_cache (`bool`, *optional*, defaults to `True`):
210
210
  Whether or not the model should return the last key/values attentions (not used by all models). Only
211
211
  relevant if `config.is_decoder=True`.
212
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
213
- Whether the model's input and output word embeddings should be tied.
214
212
  rope_parameters (`RopeParameters`, *optional*):
215
213
  Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
216
214
  a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
@@ -242,6 +240,12 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
242
240
  Indicate which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock
243
241
  The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
244
242
  If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
243
+ pad_token_id (`int`, *optional*):
244
+ Padding token id.
245
+ bos_token_id (`int`, *optional*):
246
+ Beginning of stream token id.
247
+ eos_token_id (`int`, *optional*):
248
+ End of stream token id.
245
249
 
246
250
  ```python
247
251
  >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig
@@ -266,9 +270,8 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
266
270
  "layers.*.self_attn.k_proj": "colwise",
267
271
  "layers.*.self_attn.v_proj": "colwise",
268
272
  "layers.*.self_attn.o_proj": "rowwise",
269
- "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
270
- "layers.*.mlp.experts.down_proj": "local_rowwise",
271
- "layers.*.mlp.experts": "gather",
273
+ "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
274
+ "layers.*.mlp.experts.down_proj": "rowwise",
272
275
  "layers.*.mlp.gate_proj": "colwise",
273
276
  "layers.*.mlp.up_proj": "colwise",
274
277
  "layers.*.mlp.down_proj": "rowwise",
@@ -292,7 +295,6 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
292
295
  initializer_range: float | None = 0.02,
293
296
  rms_norm_eps: float | None = 1e-6,
294
297
  use_cache: bool | None = True,
295
- tie_word_embeddings: bool | None = False,
296
298
  rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
297
299
  attention_bias: bool | None = False,
298
300
  sliding_window: int | None = None,
@@ -305,6 +307,9 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
305
307
  output_router_logits: bool | None = False,
306
308
  router_aux_loss_coef: float | None = 0.001,
307
309
  mlp_only_layers: list[int] | None = None,
310
+ pad_token_id: int | None = None,
311
+ bos_token_id: int | None = None,
312
+ eos_token_id: int | None = None,
308
313
  **kwargs,
309
314
  ):
310
315
  self.vocab_size = vocab_size
@@ -333,9 +338,11 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig):
333
338
  self.output_router_logits = output_router_logits
334
339
  self.router_aux_loss_coef = router_aux_loss_coef
335
340
  self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
341
+ self.pad_token_id = pad_token_id
342
+ self.bos_token_id = bos_token_id
343
+ self.eos_token_id = eos_token_id
336
344
 
337
345
  super().__init__(
338
- tie_word_embeddings=tie_word_embeddings,
339
346
  ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"},
340
347
  **kwargs,
341
348
  )
@@ -374,6 +381,8 @@ class Qwen3OmniMoeThinkerConfig(PreTrainedConfig):
374
381
  The user token id to encode the user token.
375
382
  initializer_range (`float`, *optional*, defaults to 0.02):
376
383
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
384
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
385
+ Whether the model's input and output word embeddings should be tied.
377
386
 
378
387
  Example:
379
388
 
@@ -411,12 +420,14 @@ class Qwen3OmniMoeThinkerConfig(PreTrainedConfig):
411
420
  audio_start_token_id=151647,
412
421
  user_token_id=872,
413
422
  initializer_range=0.02,
423
+ tie_word_embeddings=False,
414
424
  **kwargs,
415
425
  ):
416
426
  self.user_token_id = user_token_id
417
427
  self.position_id_per_seconds = position_id_per_seconds
418
428
  self.audio_start_token_id = audio_start_token_id
419
429
  self.initializer_range = initializer_range
430
+ self.tie_word_embeddings = tie_word_embeddings
420
431
 
421
432
  if isinstance(vision_config, dict):
422
433
  vision_config = Qwen3OmniMoeVisionEncoderConfig(**vision_config)
@@ -504,6 +515,12 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):
504
515
  Attention pattern for each layer.
505
516
  attention_dropout (`float`, *optional*, defaults to 0.0):
506
517
  The dropout ratio for the attention probabilities.
518
+ pad_token_id (`int`, *optional*):
519
+ Padding token id.
520
+ bos_token_id (`int`, *optional*):
521
+ Beginning of stream token id.
522
+ eos_token_id (`int`, *optional*):
523
+ End of stream token id.
507
524
 
508
525
  ```python
509
526
  >>> from transformers import Qwen3OmniMoeTalkerCodePredictorModel, Qwen3OmniMoeTalkerCodePredictorConfig
@@ -555,11 +572,16 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):
555
572
  rope_parameters: int | None = None,
556
573
  attention_bias: bool | None = False,
557
574
  sliding_window: int | None = None,
575
+ max_window_layers: int | None = 28,
558
576
  layer_types: list[str] | None = None,
559
577
  attention_dropout: int | None = 0,
560
578
  num_code_groups: int | None = 32,
579
+ pad_token_id: int | None = None,
580
+ bos_token_id: int | None = None,
581
+ eos_token_id: int | None = None,
561
582
  **kwargs,
562
583
  ):
584
+ self.num_code_groups = num_code_groups
563
585
  self.vocab_size = vocab_size
564
586
  self.max_position_embeddings = max_position_embeddings
565
587
  self.hidden_size = hidden_size
@@ -567,6 +589,7 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):
567
589
  self.num_hidden_layers = num_hidden_layers
568
590
  self.num_attention_heads = num_attention_heads
569
591
  self.sliding_window = sliding_window
592
+ self.max_window_layers = max_window_layers
570
593
 
571
594
  # for backward compatibility
572
595
  if num_key_value_heads is None:
@@ -591,13 +614,13 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):
591
614
  ]
592
615
  layer_type_validation(self.layer_types, self.num_hidden_layers)
593
616
 
617
+ self.pad_token_id = pad_token_id
618
+ self.bos_token_id = bos_token_id
619
+ self.eos_token_id = eos_token_id
620
+ self.tie_word_embeddings = tie_word_embeddings
594
621
  self.rope_parameters = rope_parameters
595
622
 
596
- super().__init__(
597
- tie_word_embeddings=tie_word_embeddings,
598
- **kwargs,
599
- )
600
- self.num_code_groups = num_code_groups
623
+ super().__init__(**kwargs)
601
624
 
602
625
 
603
626
  class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
@@ -674,6 +697,12 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
674
697
  Indicate which layers use Qwen3OmniMoeTalkerTextMLP rather than Qwen3OmniMoeTalkerTextSparseMoeBlock
675
698
  The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
676
699
  If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
700
+ pad_token_id (`int`, *optional*):
701
+ Padding token id.
702
+ bos_token_id (`int`, *optional*):
703
+ Beginning of stream token id.
704
+ eos_token_id (`int`, *optional*):
705
+ End of stream token id.
677
706
 
678
707
  ```python
679
708
  >>> from transformers import Qwen3OmniMoeTalkerTextModel, Qwen3OmniMoeTalkerTextConfig
@@ -691,15 +720,18 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
691
720
  model_type = "qwen3_omni_moe_talker_text"
692
721
  keys_to_ignore_at_inference = ["past_key_values"]
693
722
 
723
+ attribute_map = {
724
+ "num_experts": "num_local_experts",
725
+ }
726
+
694
727
  # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
695
728
  base_model_tp_plan = {
696
729
  "layers.*.self_attn.q_proj": "colwise",
697
730
  "layers.*.self_attn.k_proj": "colwise",
698
731
  "layers.*.self_attn.v_proj": "colwise",
699
732
  "layers.*.self_attn.o_proj": "rowwise",
700
- "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
701
- "layers.*.mlp.experts.down_proj": "local_rowwise",
702
- "layers.*.mlp.experts": "gather",
733
+ "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
734
+ "layers.*.mlp.experts.down_proj": "rowwise",
703
735
  "layers.*.mlp.gate_proj": "colwise",
704
736
  "layers.*.mlp.up_proj": "colwise",
705
737
  "layers.*.mlp.down_proj": "rowwise",
@@ -736,6 +768,9 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
736
768
  output_router_logits: bool | None = False,
737
769
  router_aux_loss_coef: float | None = 0.001,
738
770
  mlp_only_layers: list[int] | None = None,
771
+ pad_token_id: int | None = None,
772
+ bos_token_id: int | None = None,
773
+ eos_token_id: int | None = None,
739
774
  **kwargs,
740
775
  ):
741
776
  self.vocab_size = vocab_size
@@ -765,10 +800,11 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
765
800
  self.router_aux_loss_coef = router_aux_loss_coef
766
801
  self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
767
802
 
768
- super().__init__(
769
- tie_word_embeddings=tie_word_embeddings,
770
- **kwargs,
771
- )
803
+ self.pad_token_id = pad_token_id
804
+ self.bos_token_id = bos_token_id
805
+ self.eos_token_id = eos_token_id
806
+ self.tie_word_embeddings = tie_word_embeddings
807
+ super().__init__(**kwargs)
772
808
 
773
809
 
774
810
  class Qwen3OmniMoeTalkerConfig(PreTrainedConfig):
@@ -44,8 +44,8 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_ma
44
44
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
45
45
  from ...modeling_layers import GradientCheckpointingLayer
46
46
  from ...modeling_outputs import (
47
- BaseModelOutput,
48
47
  BaseModelOutputWithPast,
48
+ BaseModelOutputWithPooling,
49
49
  CausalLMOutputWithPast,
50
50
  MoeCausalLMOutputWithPast,
51
51
  MoeModelOutputWithPast,
@@ -53,8 +53,14 @@ from ...modeling_outputs import (
53
53
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
54
54
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
55
55
  from ...processing_utils import Unpack
56
- from ...utils import auto_docstring, can_return_tuple, is_grouped_mm_available
57
- from ...utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs, maybe_autocast
56
+ from ...utils import auto_docstring, can_return_tuple, is_grouped_mm_available, torch_compilable_check
57
+ from ...utils.generic import (
58
+ OutputRecorder,
59
+ TransformersKwargs,
60
+ check_model_inputs,
61
+ is_flash_attention_requested,
62
+ maybe_autocast,
63
+ )
58
64
  from .configuration_qwen3_omni_moe import (
59
65
  Qwen3OmniMoeAudioEncoderConfig,
60
66
  Qwen3OmniMoeCode2WavConfig,
@@ -68,6 +74,17 @@ from .configuration_qwen3_omni_moe import (
68
74
  )
69
75
 
70
76
 
77
+ @dataclass
78
+ @auto_docstring
79
+ class BaseModelOutputWithDeepstackFeatures(BaseModelOutputWithPooling):
80
+ r"""
81
+ deepstack_features (`List[torch.FloatTensor]`, *optional*):
82
+ List of hidden-states (feature maps) from deepstack layers.
83
+ """
84
+
85
+ deepstack_features: list[torch.FloatTensor] | None = None
86
+
87
+
71
88
  class SinusoidsPositionEmbedding(nn.Module):
72
89
  def __init__(self, length, channels, max_timescale=10000):
73
90
  super().__init__()
@@ -578,9 +595,9 @@ class Qwen3OmniMoeAudioAttention(nn.Module):
578
595
  value_states = value_states.transpose(0, 1).unsqueeze(0)
579
596
  max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
580
597
 
581
- attention_interface: Callable = eager_attention_forward
582
- if self.config._attn_implementation != "eager":
583
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
598
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
599
+ self.config._attn_implementation, eager_attention_forward
600
+ )
584
601
 
585
602
  attn_output, _ = attention_interface(
586
603
  self,
@@ -670,6 +687,10 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
670
687
  input_modalities = "audio"
671
688
  _no_split_modules = ["Qwen3OmniMoeAudioEncoderLayer"]
672
689
  _supports_sdpa = True
690
+ _can_record_outputs = {
691
+ "hidden_states": Qwen3OmniMoeAudioEncoderLayer,
692
+ "attentions": Qwen3OmniMoeAudioAttention,
693
+ }
673
694
 
674
695
  def __init__(self, config: Qwen3OmniMoeAudioEncoderConfig):
675
696
  super().__init__(config)
@@ -716,7 +737,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
716
737
  # NOTE: the created attention masl only approximates the ragged FA2 attention by
717
738
  # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
718
739
  # blocks. Though it will not be a 100% match for FA2's `varlen` path
719
- if self.config._attn_implementation == "flash_attention_2":
740
+ if is_flash_attention_requested(self.config):
720
741
  return None
721
742
 
722
743
  seq_length = inputs_tensor.shape[0]
@@ -730,6 +751,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
730
751
  attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
731
752
  return attention_mask
732
753
 
754
+ @check_model_inputs(tie_last_hidden_states=False)
733
755
  @auto_docstring
734
756
  def forward(
735
757
  self,
@@ -747,11 +769,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
747
769
  aftercnn_lens = _get_feat_extract_output_lengths(feature_lens)
748
770
  chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
749
771
 
750
- chunk_lengths = torch.tensor(
751
- [self.n_window * 2] * chunk_num.sum(),
752
- dtype=torch.long,
753
- device=feature_lens.device,
754
- )
772
+ chunk_lengths = torch.full((chunk_num.sum(),), self.n_window * 2, dtype=torch.long, device=feature_lens.device)
755
773
  tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]
756
774
  chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
757
775
  chunk_lengths[chunk_lengths == 0] = self.n_window * 2
@@ -803,7 +821,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
803
821
  hidden_states = self.proj1(hidden_states)
804
822
  hidden_states = self.act(hidden_states)
805
823
  hidden_states = self.proj2(hidden_states)
806
- return BaseModelOutput(last_hidden_state=hidden_states)
824
+ return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
807
825
 
808
826
  def padded_and_mask_function(self, tensor_list, tensor_len, padding_value=0, padding_side="right"):
809
827
  """
@@ -907,11 +925,11 @@ class Qwen3OmniMoeVisionAttention(nn.Module):
907
925
  key_states = key_states.transpose(0, 1).unsqueeze(0)
908
926
  value_states = value_states.transpose(0, 1).unsqueeze(0)
909
927
 
910
- attention_interface: Callable = eager_attention_forward
911
- if self.config._attn_implementation != "eager":
912
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
928
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
929
+ self.config._attn_implementation, eager_attention_forward
930
+ )
913
931
 
914
- if "flash" in self.config._attn_implementation:
932
+ if is_flash_attention_requested(self.config):
915
933
  # Flash Attention: Use cu_seqlens for variable length attention
916
934
  max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
917
935
  attn_output, _ = attention_interface(
@@ -996,6 +1014,25 @@ class Qwen3OmniMoeVisionRotaryEmbedding(nn.Module):
996
1014
  return freqs
997
1015
 
998
1016
 
1017
+ class Qwen3OmniMoeTextTopKRouter(nn.Module):
1018
+ def __init__(self, config):
1019
+ super().__init__()
1020
+ self.top_k = config.num_experts_per_tok
1021
+ self.num_experts = config.num_experts
1022
+ self.hidden_dim = config.hidden_size
1023
+ self.weight = nn.Parameter(torch.zeros(self.num_experts, self.hidden_dim))
1024
+
1025
+ def forward(self, hidden_states):
1026
+ hidden_states = hidden_states.reshape(-1, self.hidden_dim)
1027
+ router_logits = F.linear(hidden_states, self.weight) # (seq_len, num_experts)
1028
+ router_logits = torch.nn.functional.softmax(router_logits, dtype=torch.float, dim=-1)
1029
+ router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1) # (seq_len, top_k)
1030
+ router_top_value /= router_top_value.sum(dim=-1, keepdim=True)
1031
+ router_top_value = router_top_value.to(router_logits.dtype)
1032
+ router_scores = router_top_value
1033
+ return router_logits, router_scores, router_indices
1034
+
1035
+
999
1036
  class Qwen3OmniMoeVisionMLP(nn.Module):
1000
1037
  def __init__(self, config):
1001
1038
  super().__init__()
@@ -1009,26 +1046,6 @@ class Qwen3OmniMoeVisionMLP(nn.Module):
1009
1046
  return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state)))
1010
1047
 
1011
1048
 
1012
- class Qwen3OmniMoeVisionPatchEmbed(nn.Module):
1013
- def __init__(self, config) -> None:
1014
- super().__init__()
1015
- self.patch_size = config.patch_size
1016
- self.temporal_patch_size = config.temporal_patch_size
1017
- self.in_channels = config.in_channels
1018
- self.embed_dim = config.hidden_size
1019
-
1020
- kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
1021
- self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
1022
-
1023
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1024
- target_dtype = self.proj.weight.dtype
1025
- hidden_states = hidden_states.view(
1026
- -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
1027
- )
1028
- hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
1029
- return hidden_states
1030
-
1031
-
1032
1049
  class Qwen3OmniMoeVisionBlock(GradientCheckpointingLayer):
1033
1050
  def __init__(self, config, attn_implementation: str = "sdpa") -> None:
1034
1051
  super().__init__()
@@ -1056,9 +1073,34 @@ class Qwen3OmniMoeVisionBlock(GradientCheckpointingLayer):
1056
1073
  return hidden_states
1057
1074
 
1058
1075
 
1076
+ class Qwen3OmniMoeVisionPatchEmbed(nn.Module):
1077
+ def __init__(self, config) -> None:
1078
+ super().__init__()
1079
+ self.patch_size = config.patch_size
1080
+ self.temporal_patch_size = config.temporal_patch_size
1081
+ self.in_channels = config.in_channels
1082
+ self.embed_dim = config.hidden_size
1083
+
1084
+ kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
1085
+ self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
1086
+
1087
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1088
+ target_dtype = self.proj.weight.dtype
1089
+ hidden_states = hidden_states.view(
1090
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
1091
+ )
1092
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
1093
+ return hidden_states
1094
+
1095
+
1059
1096
  class Qwen3OmniMoeVisionEncoder(Qwen3OmniMoePreTrainedModel):
1060
1097
  config: Qwen3OmniMoeVisionEncoderConfig
1061
1098
  _no_split_modules = ["Qwen3OmniMoeVisionBlock"]
1099
+ _can_record_outputs = {
1100
+ "router_logits": OutputRecorder(Qwen3OmniMoeTextTopKRouter, layer_name="mlp.gate", index=0),
1101
+ "hidden_states": Qwen3OmniMoeVisionBlock,
1102
+ "attentions": Qwen3OmniMoeVisionAttention,
1103
+ }
1062
1104
 
1063
1105
  def __init__(self, config, *inputs, **kwargs) -> None:
1064
1106
  super().__init__(config, *inputs, **kwargs)
@@ -1196,7 +1238,10 @@ class Qwen3OmniMoeVisionEncoder(Qwen3OmniMoePreTrainedModel):
1196
1238
  patch_pos_embeds = torch.cat(patch_pos_embeds_permute)
1197
1239
  return patch_pos_embeds
1198
1240
 
1199
- def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
1241
+ @check_model_inputs
1242
+ def forward(
1243
+ self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
1244
+ ) -> tuple | BaseModelOutputWithDeepstackFeatures:
1200
1245
  """
1201
1246
  Args:
1202
1247
  hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1244,9 +1289,13 @@ class Qwen3OmniMoeVisionEncoder(Qwen3OmniMoePreTrainedModel):
1244
1289
  )
1245
1290
  deepstack_feature_lists.append(deepstack_feature)
1246
1291
 
1247
- hidden_states = self.merger(hidden_states)
1292
+ merged_hidden_states = self.merger(hidden_states)
1248
1293
 
1249
- return hidden_states, deepstack_feature_lists
1294
+ return BaseModelOutputWithDeepstackFeatures(
1295
+ last_hidden_state=hidden_states,
1296
+ pooler_output=merged_hidden_states,
1297
+ deepstack_features=deepstack_feature_lists,
1298
+ )
1250
1299
 
1251
1300
  @property
1252
1301
  def deepstack_merger_list(self):
@@ -1524,9 +1573,9 @@ class Qwen3OmniMoeThinkerTextAttention(nn.Module):
1524
1573
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
1525
1574
  key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
1526
1575
 
1527
- attention_interface: Callable = eager_attention_forward
1528
- if self.config._attn_implementation != "eager":
1529
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
1576
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
1577
+ self.config._attn_implementation, eager_attention_forward
1578
+ )
1530
1579
 
1531
1580
  attn_output, attn_weights = attention_interface(
1532
1581
  self,
@@ -1920,7 +1969,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
1920
1969
  self.vocab_size = config.text_config.vocab_size
1921
1970
  self.model = Qwen3OmniMoeThinkerTextModel._from_config(config.text_config)
1922
1971
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
1923
- self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
1924
1972
  self.spatial_merge_size = config.vision_config.spatial_merge_size
1925
1973
  self.rope_deltas = None
1926
1974
  self.num_experts = config.text_config.num_experts
@@ -1934,52 +1982,56 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
1934
1982
  def set_input_embeddings(self, value):
1935
1983
  self.model.set_input_embeddings(value)
1936
1984
 
1985
+ @can_return_tuple
1986
+ @auto_docstring
1937
1987
  def get_video_features(
1938
- self, pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None
1939
- ):
1940
- """
1941
- Encodes videos into continuous embeddings that can be forwarded to the language model.
1942
-
1943
- Args:
1944
- pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1945
- The tensors corresponding to the input videos.
1946
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1947
- The temporal, height and width of feature shape of each video in LLM.
1988
+ self,
1989
+ pixel_values_videos: torch.FloatTensor,
1990
+ video_grid_thw: torch.LongTensor | None = None,
1991
+ **kwargs: Unpack[TransformersKwargs],
1992
+ ) -> tuple | BaseModelOutputWithDeepstackFeatures:
1993
+ r"""
1994
+ pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1995
+ The tensors corresponding to the input videos.
1996
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1997
+ The temporal, height and width of feature shape of each video in LLM.
1948
1998
  """
1949
1999
  pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
1950
- video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
1951
- return video_embeds
1952
-
1953
- def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
1954
- """
1955
- Encodes images into continuous embeddings that can be forwarded to the language model.
2000
+ return self.visual(pixel_values_videos, grid_thw=video_grid_thw, **kwargs)
1956
2001
 
1957
- Args:
1958
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1959
- The tensors corresponding to the input images.
1960
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1961
- The temporal, height and width of feature shape of each image in LLM.
2002
+ @can_return_tuple
2003
+ @auto_docstring
2004
+ def get_image_features(
2005
+ self,
2006
+ pixel_values: torch.FloatTensor,
2007
+ image_grid_thw: torch.LongTensor | None = None,
2008
+ **kwargs: Unpack[TransformersKwargs],
2009
+ ) -> tuple | BaseModelOutputWithDeepstackFeatures:
2010
+ r"""
2011
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
2012
+ The tensors corresponding to the input images.
2013
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
2014
+ The temporal, height and width of feature shape of each image in LLM.
1962
2015
  """
1963
2016
  pixel_values = pixel_values.type(self.visual.dtype)
1964
- image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1965
- return image_embeds
2017
+ return self.visual(pixel_values, grid_thw=image_grid_thw, **kwargs)
1966
2018
 
2019
+ @can_return_tuple
2020
+ @auto_docstring
1967
2021
  def get_audio_features(
1968
2022
  self,
1969
2023
  input_features: torch.FloatTensor,
1970
2024
  feature_attention_mask: torch.LongTensor | None = None,
1971
2025
  audio_feature_lengths: torch.LongTensor | None = None,
1972
- ):
1973
- """
1974
- Encodes audios into continuous embeddings that can be forwarded to the language model.
1975
-
1976
- Args:
1977
- input_features (`torch.FloatTensor`):
1978
- The tensors corresponding to the input audios.
1979
- feature_attention_mask (`torch.LongTensor`, *optional*):
1980
- Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1981
- audio_feature_lengths (`torch.LongTensor` of shape `(num_audios)`, *optional*):
1982
- The length of feature shape of each audio in LLM.
2026
+ **kwargs: Unpack[TransformersKwargs],
2027
+ ) -> tuple | BaseModelOutputWithPooling:
2028
+ r"""
2029
+ input_features (`torch.FloatTensor`):
2030
+ The tensors corresponding to the input audios.
2031
+ feature_attention_mask (`torch.LongTensor`, *optional*):
2032
+ Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
2033
+ audio_feature_lengths (`torch.LongTensor` of shape `(num_audios)`, *optional*):
2034
+ The length of feature shape of each audio in LLM.
1983
2035
  """
1984
2036
  if feature_attention_mask is not None:
1985
2037
  audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
@@ -1991,10 +2043,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
1991
2043
  audio_outputs = self.audio_tower(
1992
2044
  input_features,
1993
2045
  feature_lens=feature_lens,
2046
+ return_dict=True,
2047
+ **kwargs,
1994
2048
  )
1995
- audio_features = audio_outputs.last_hidden_state
1996
2049
 
1997
- return audio_features
2050
+ return audio_outputs
1998
2051
 
1999
2052
  def get_placeholder_mask(
2000
2053
  self,
@@ -2029,16 +2082,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
2029
2082
 
2030
2083
  n_image_tokens = special_image_mask.sum()
2031
2084
  special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
2032
- if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
2033
- raise ValueError(
2034
- f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
2085
+ if image_features is not None:
2086
+ torch_compilable_check(
2087
+ inputs_embeds[special_image_mask].numel() == image_features.numel(),
2088
+ f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}",
2035
2089
  )
2036
2090
 
2037
2091
  n_video_tokens = special_video_mask.sum()
2038
2092
  special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
2039
- if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
2040
- raise ValueError(
2041
- f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
2093
+ if video_features is not None:
2094
+ torch_compilable_check(
2095
+ inputs_embeds[special_video_mask].numel() == video_features.numel(),
2096
+ f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}",
2042
2097
  )
2043
2098
 
2044
2099
  special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
@@ -2141,13 +2196,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
2141
2196
  input_features,
2142
2197
  feature_attention_mask=feature_attention_mask,
2143
2198
  audio_feature_lengths=audio_feature_lengths,
2144
- )
2199
+ return_dict=True,
2200
+ ).last_hidden_state
2145
2201
  audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
2146
2202
  _, _, audio_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
2147
2203
  inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_features)
2148
2204
 
2149
2205
  if pixel_values is not None:
2150
- image_embeds, image_embeds_multiscale = self.get_image_features(pixel_values, image_grid_thw)
2206
+ image_outputs: BaseModelOutputWithDeepstackFeatures = self.get_image_features(
2207
+ pixel_values, image_grid_thw, return_dict=True
2208
+ )
2209
+ image_embeds = image_outputs.pooler_output
2210
+ image_embeds_multiscale = image_outputs.deepstack_features
2151
2211
  image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2152
2212
  image_mask, _, _ = self.get_placeholder_mask(
2153
2213
  input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
@@ -2155,8 +2215,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
2155
2215
  inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
2156
2216
 
2157
2217
  if pixel_values_videos is not None:
2158
- video_embeds, video_embeds_multiscale = self.get_video_features(pixel_values_videos, video_grid_thw)
2159
-
2218
+ video_outputs: BaseModelOutputWithDeepstackFeatures = self.get_video_features(
2219
+ pixel_values_videos, video_grid_thw, return_dict=True
2220
+ )
2221
+ video_embeds = video_outputs.pooler_output
2222
+ video_embeds_multiscale = video_outputs.deepstack_features
2160
2223
  video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2161
2224
  _, video_mask, _ = self.get_placeholder_mask(
2162
2225
  input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
@@ -2405,9 +2468,9 @@ class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module):
2405
2468
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
2406
2469
  key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
2407
2470
 
2408
- attention_interface: Callable = eager_attention_forward
2409
- if self.config._attn_implementation != "eager":
2410
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
2471
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
2472
+ self.config._attn_implementation, eager_attention_forward
2473
+ )
2411
2474
 
2412
2475
  attn_output, attn_weights = attention_interface(
2413
2476
  self,
@@ -2656,7 +2719,7 @@ class Qwen3OmniMoeTalkerCodePredictorModel(Qwen3OmniMoePreTrainedModel):
2656
2719
  @auto_docstring
2657
2720
  class Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration(Qwen3OmniMoePreTrainedModel, GenerationMixin):
2658
2721
  _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
2659
- _tp_plan = {"lm_head": "colwise_rep"}
2722
+ _tp_plan = {"lm_head": "colwise_gather_output"}
2660
2723
  _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
2661
2724
  config_class = Qwen3OmniMoeTalkerCodePredictorConfig
2662
2725
  base_model_prefix = "talker.code_predictor"
@@ -3049,7 +3112,7 @@ class Qwen3OmniMoeTalkerModel(Qwen3OmniMoePreTrainedModel):
3049
3112
  @auto_docstring
3050
3113
  class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin):
3051
3114
  _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
3052
- _tp_plan = {"codec_head": "colwise_rep"}
3115
+ _tp_plan = {"codec_head": "colwise_gather_output"}
3053
3116
  _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
3054
3117
  config_class = Qwen3OmniMoeTalkerConfig
3055
3118
  base_model_prefix = "talker"
@@ -3446,9 +3509,9 @@ class Qwen3OmniMoeCode2WavAttention(nn.Module):
3446
3509
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
3447
3510
  key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
3448
3511
 
3449
- attention_interface: Callable = eager_attention_forward
3450
- if self.config._attn_implementation != "eager":
3451
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
3512
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
3513
+ self.config._attn_implementation, eager_attention_forward
3514
+ )
3452
3515
 
3453
3516
  attn_output, attn_weights = attention_interface(
3454
3517
  self,