transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py

@@ -35,7 +35,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -49,8 +54,8 @@ from ...modeling_outputs import (
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs
+from ...utils import auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs, maybe_autocast
 from .configuration_qwen3_omni_moe import (
     Qwen3OmniMoeAudioEncoderConfig,
     Qwen3OmniMoeCode2WavConfig,
@@ -64,6 +69,27 @@ from .configuration_qwen3_omni_moe import (
 )


+class SinusoidsPositionEmbedding(nn.Module):
+    def __init__(self, length, channels, max_timescale=10000):
+        super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
+        if channels % 2 != 0:
+            raise ValueError("SinusoidsPositionEmbedding needs even channels input")
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        self.register_buffer(
+            "positional_embedding",
+            torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
+            persistent=False,
+        )
+
+    def forward(self, seqlen: int):
+        return self.positional_embedding[:seqlen, :]
+
+
 @auto_docstring
 class Qwen3OmniMoePreTrainedModel(PreTrainedModel):
     config: Qwen3OmniMoeConfig
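
The relocated class now stores length, channels, and max_timescale on self so that _init_weights (next hunk) can rebuild the non-persistent buffer after checkpoint loading. A minimal, self-contained sketch of the table the module registers, with illustrative sizes (the model derives the real ones from its config):

import numpy as np
import torch

length, channels, max_timescale = 16, 8, 10000
# Geometric ladder of timescales, one per channel pair.
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
# Outer product of positions and inverse timescales: shape (length, channels // 2).
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
# First half of each row is sin, second half is cos.
table = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
print(table.shape)  # torch.Size([16, 8])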
@@ -85,6 +111,19 @@ class Qwen3OmniMoePreTrainedModel(PreTrainedModel):
              init.normal_(module.experts.gate_up_proj, mean=0.0, std=std)
              init.normal_(module.experts.down_proj, mean=0.0, std=std)
              init.normal_(module.gate.weight, mean=0.0, std=std)
+         elif isinstance(module, Qwen3OmniMoeCode2Wav):
+             init.copy_(
+                 module.code_offset,
+                 torch.arange(module.config.num_quantizers).view(1, -1, 1) * module.config.codebook_size,
+             )
+         elif isinstance(module, SinusoidsPositionEmbedding):
+             log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+             inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+             scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+             init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+         elif isinstance(module, Qwen3OmniMoeVisionRotaryEmbedding):
+             inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+             init.copy_(module.inv_freq, inv_freq)


  def _get_feat_extract_output_lengths(input_lengths):
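
These new `elif` branches exist because the tables above are registered with `persistent=False`: they are absent from checkpoints, so `_init_weights` must be able to rebuild them. A hedged sketch of the underlying pattern (hypothetical module, not the transformers API):

```python
import torch
from torch import nn

class WithDerivedBuffer(nn.Module):
    """Hypothetical module whose buffer is derived from hyperparameters, not learned."""

    def __init__(self, dim: int, theta: float = 10000.0):
        super().__init__()
        self.dim, self.theta = dim, theta  # keep the inputs needed to recompute
        self.register_buffer("inv_freq", self._compute(), persistent=False)

    def _compute(self) -> torch.Tensor:
        return 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim))

    def reset_buffer(self) -> None:
        # mirrors what the elif branches above do via init.copy_
        self.inv_freq.copy_(self._compute())
```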
@@ -620,24 +659,6 @@ class Qwen3OmniMoeAudioEncoderLayer(GradientCheckpointingLayer):
          return outputs


- class SinusoidsPositionEmbedding(nn.Module):
-     def __init__(self, length, channels, max_timescale=10000):
-         super().__init__()
-         if channels % 2 != 0:
-             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
-         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
-         inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
-         scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-         self.register_buffer(
-             "positional_embedding",
-             torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
-             persistent=False,
-         )
-
-     def forward(self, seqlen: int):
-         return self.positional_embedding[:seqlen, :]
-
-
  @auto_docstring(
      custom_intro="""
      Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@@ -716,6 +737,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
          input_features,
          feature_lens=None,
          aftercnn_lens=None,
+         **kwargs,
      ):
          r"""
          feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -890,8 +912,8 @@ class Qwen3OmniMoeVisionAttention(nn.Module):
          if self.config._attn_implementation != "eager":
              attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-         if self.config._attn_implementation == "flash_attention_2":
-             # Flash Attention 2: Use cu_seqlens for variable length attention
+         if "flash" in self.config._attn_implementation:
+             # Flash Attention: Use cu_seqlens for variable length attention
              max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
              attn_output, _ = attention_interface(
                  self,
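
The loosened check matches any flash-family backend name rather than only `flash_attention_2`. For orientation, `cu_seqlens` packs variable-length sequences as cumulative boundaries (toy values, illustration only):

```python
import torch

seq_lens = torch.tensor([4, 6, 2])  # e.g. patches per image, packed into one batch
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.long), seq_lens.cumsum(0)])
# cu_seqlens == tensor([0, 4, 10, 12]): sequence i spans [cu_seqlens[i], cu_seqlens[i + 1])
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()  # == 6, as computed above
```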
@@ -959,6 +981,22 @@ class Qwen3OmniMoeVisionPatchMerger(nn.Module):
          return hidden


+ class Qwen3OmniMoeVisionRotaryEmbedding(nn.Module):
+     inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+     def __init__(self, dim: int, theta: float = 10000.0) -> None:
+         super().__init__()
+         self.dim = dim
+         self.theta = theta
+         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+     def forward(self, seqlen: int) -> torch.Tensor:
+         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+         freqs = torch.outer(seq, self.inv_freq)
+         return freqs
+
+
  class Qwen3OmniMoeVisionMLP(nn.Module):
      def __init__(self, config):
          super().__init__()
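
The relocated rotary module now also stores `dim` and `theta`, which is what lets `_init_weights` above recompute `inv_freq`. Its forward is just an outer product of positions and inverse frequencies (standalone sketch, toy sizes):

```python
import torch

dim, theta, seqlen = 8, 10000.0, 5
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
freqs = torch.outer(torch.arange(seqlen, dtype=torch.float), inv_freq)
assert freqs.shape == (seqlen, dim // 2)  # one rotation angle per (position, frequency)
```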
@@ -992,20 +1030,6 @@ class Qwen3OmniMoeVisionPatchEmbed(nn.Module):
          return hidden_states


- class Qwen3OmniMoeVisionRotaryEmbedding(nn.Module):
-     inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-     def __init__(self, dim: int, theta: float = 10000.0) -> None:
-         super().__init__()
-         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-     def forward(self, seqlen: int) -> torch.Tensor:
-         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-         freqs = torch.outer(seq, self.inv_freq)
-         return freqs
-
-
  class Qwen3OmniMoeVisionBlock(GradientCheckpointingLayer):
      def __init__(self, config, attn_implementation: str = "sdpa") -> None:
          super().__init__()
@@ -1072,6 +1096,8 @@ class Qwen3OmniMoeVisionEncoder(Qwen3OmniMoePreTrainedModel):

          self.gradient_checkpointing = False

+         self.post_init()
+
      def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
          merge_size = self.spatial_merge_size

@@ -1245,7 +1271,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
          inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

          self.register_buffer("inv_freq", inv_freq, persistent=False)
-         self.original_inv_freq = inv_freq
+         self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

          self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20])

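Keeping `original_inv_freq` as a plain attribute aliased the live buffer, so any in-place dynamic-RoPE rescale silently corrupted the saved original; a cloned non-persistent buffer also follows `.to(device)` moves. A minimal demonstration of the aliasing hazard (toy module, illustration only):

```python
import torch
from torch import nn

m = nn.Module()
freq = torch.tensor([1.0, 0.1])
m.register_buffer("inv_freq", freq, persistent=False)
m.original_inv_freq = freq   # plain attribute: shares storage with the buffer
m.inv_freq.mul_(0.5)         # an in-place rescale, as dynamic RoPE updates may do
print(m.original_inv_freq)   # tensor([0.5000, 0.0500]) -- the "original" is lost
# registering `inv_freq.clone()` as its own buffer keeps an independent copy
```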
@@ -1290,7 +1316,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
          position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

          device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
              freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
              freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
              emb = torch.cat((freqs, freqs), dim=-1)
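
`maybe_autocast` comes from `...utils.generic` (see the import hunk above); presumably it behaves like `torch.autocast` while degrading gracefully on backends without autocast support. A hedged sketch of that idea only (an assumption, the real helper may differ):

```python
import contextlib
import torch

def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    # Assumed behavior: fall back to a no-op context instead of raising
    # when the device type has no autocast implementation.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return contextlib.nullcontext()
```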
@@ -1317,6 +1343,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
          return freqs_t


+ @use_experts_implementation
  class Qwen3OmniMoeThinkerTextExperts(nn.Module):
      """
      ModuleList of experts.
@@ -1442,6 +1469,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
      return q_embed, k_embed


+ @use_kernelized_func(apply_rotary_pos_emb)
  class Qwen3OmniMoeThinkerTextAttention(nn.Module):
      """Multi-headed attention from 'Attention Is All You Need' paper"""

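Together with the removal of the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignments in the hunks below, the decorator evidently attaches the rotary function at the class level so an optimized hub kernel can be swapped in. A hedged sketch of that shape only (the real `use_kernelized_func` in `...integrations` may resolve kernels differently):

```python
def use_kernelized_func(fallback):
    """Hypothetical minimal form: attach `fallback` as the class-level rotary_fn."""
    def wrap(cls):
        cls.rotary_fn = staticmethod(fallback)  # a fused kernel could replace this
        return cls
    return wrap
```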
@@ -1467,7 +1495,6 @@ class Qwen3OmniMoeThinkerTextAttention(nn.Module):
          self.o_proj = nn.Linear(
              config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
          )
-         self.rotary_fn = apply_rotary_pos_emb
          self.q_norm = Qwen3OmniMoeThinkerTextRMSNorm(
              self.head_dim, eps=config.rms_norm_eps
          )  # unlike olmo, only on the head dim!
@@ -1595,7 +1622,9 @@ class Qwen3OmniMoeThinkerTextPreTrainedModel(PreTrainedModel):
      _supports_flash_attn = True
      _supports_sdpa = True
      _supports_flex_attn = True
-     _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+     _can_compile_fullgraph = (
+         is_grouped_mm_available()
+     )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
      _supports_attention_backend = True
      _can_record_outputs = {
          "router_logits": OutputRecorder(Qwen3OmniMoeThinkerTextTopKRouter, layer_name="mlp.gate", index=0),
@@ -2165,11 +2194,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
          audio_feature_lengths = None

          if attention_mask is not None and position_ids is None:
-             if (
-                 cache_position is None
-                 or (cache_position is not None and cache_position[0] == 0)
-                 or self.rope_deltas is None
-             ):
+             past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+             if past_key_values_length == 0 or self.rope_deltas is None:
                  delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                  position_ids, rope_deltas = self.get_rope_index(
                      input_ids,
@@ -2184,7 +2210,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                  self.rope_deltas = rope_deltas
              else:
                  batch_size, seq_length = input_ids.shape
-                 delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
+                 delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                  position_ids = torch.arange(seq_length, device=input_ids.device)
                  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                  position_ids = position_ids.add(delta)
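
The rewrite derives the decode-step offset from the cache length instead of `cache_position`, which also covers calls where no cache positions are passed. Toy illustration of the arithmetic (illustration only):

```python
import torch

past_key_values_length = 7          # tokens already in the KV cache
rope_deltas = torch.tensor([[3]])   # per-sequence multimodal offset from get_rope_index
batch_size, seq_length = 1, 1       # one new decode token
delta = past_key_values_length + rope_deltas
position_ids = torch.arange(seq_length).view(1, -1).expand(batch_size, -1) + delta
# tensor([[10]]): cache length plus the rope delta accumulated during prefill
```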
@@ -2250,6 +2276,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
          feature_attention_mask=None,
          use_audio_in_video=False,
          video_second_per_grid=None,
+         is_first_iteration=False,
          **kwargs,
      ):
          model_inputs = super().prepare_inputs_for_generation(
@@ -2268,12 +2295,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
              feature_attention_mask=feature_attention_mask,
              use_audio_in_video=use_audio_in_video,
              video_second_per_grid=video_second_per_grid,
+             is_first_iteration=is_first_iteration,
              **kwargs,
          )

          model_inputs["position_ids"] = None

-         if cache_position[0] != 0:
+         if not is_first_iteration and use_cache:
              model_inputs["pixel_values"] = None
              model_inputs["pixel_values_videos"] = None
              model_inputs["input_features"] = None
@@ -2323,6 +2351,7 @@ class Qwen3OmniMoeRMSNorm(nn.Module):
          return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


+ @use_kernelized_func(apply_rotary_pos_emb)
  class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module):
      """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -2349,7 +2378,6 @@ class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module):
          self.o_proj = nn.Linear(
              config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
          )
-         self.rotary_fn = apply_rotary_pos_emb
          self.q_norm = Qwen3OmniMoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
          self.k_norm = Qwen3OmniMoeRMSNorm(
              self.head_dim, eps=config.rms_norm_eps
@@ -2479,7 +2507,7 @@ class Qwen3OmniMoeRotaryEmbedding(nn.Module):
          inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

          self.register_buffer("inv_freq", inv_freq, persistent=False)
-         self.original_inv_freq = inv_freq
+         self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

      @staticmethod
      def compute_default_rope_parameters(
@@ -2518,7 +2546,7 @@ class Qwen3OmniMoeRotaryEmbedding(nn.Module):
          position_ids_expanded = position_ids[:, None, :].float()

          device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
              freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
              emb = torch.cat((freqs, freqs), dim=-1)
              cos = emb.cos() * self.attention_scaling
@@ -2747,6 +2775,7 @@ class Qwen3OmniMoeTalkerTextMLP(nn.Module):
          return down_proj


+ @use_experts_implementation
  class Qwen3OmniMoeTalkerTextExperts(nn.Module):
      """Collection of expert weights stored as 3D tensors."""

@@ -3022,9 +3051,9 @@ class Qwen3OmniMoeTalkerModel(Qwen3OmniMoePreTrainedModel):

  @auto_docstring
  class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin):
-     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-     _tp_plan = {"lm_head": "colwise_rep"}
-     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+     _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
+     _tp_plan = {"codec_head": "colwise_rep"}
+     _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
      config_class = Qwen3OmniMoeTalkerConfig
      base_model_prefix = "talker"
      _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
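
The talker emits audio codec tokens, so its output head is now tied to the codec embedding rather than to the text `lm_head`/`embed_tokens` pair. What the tying map means in tensor terms (toy shapes, illustration only):

```python
import torch
from torch import nn

vocab, hidden = 4096, 64
codec_embedding = nn.Embedding(vocab, hidden)
codec_head = nn.Linear(hidden, vocab, bias=False)
codec_head.weight = codec_embedding.weight  # the effect of the _tied_weights_keys entry
assert codec_head.weight.data_ptr() == codec_embedding.weight.data_ptr()
```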
@@ -3103,12 +3132,9 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
          if inputs_embeds is not None and inputs_embeds.shape[1] > 1:
              generation_step = -1
              residual_codes = None
-         if attention_mask is not None:
-             if (
-                 cache_position is None
-                 or (cache_position is not None and cache_position[0] == 0)
-                 or self.rope_deltas is None
-             ):
+         if position_ids is None:
+             past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+             if past_key_values_length == 0 or self.rope_deltas is None:
                  delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                  position_ids, rope_deltas = self.get_rope_index(
                      talker_input_ids,
@@ -3123,7 +3149,7 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
                  self.rope_deltas = rope_deltas
              else:
                  batch_size, seq_length = input_ids.shape
-                 delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
+                 delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                  position_ids = torch.arange(seq_length, device=input_ids.device)
                  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                  position_ids = position_ids.add(delta)
@@ -3218,15 +3244,31 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
          return model_kwargs

      def prepare_inputs_for_generation(
-         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         inputs_embeds=None,
+         cache_position=None,
+         is_first_iteration=False,
+         **kwargs,
      ):
          hidden_states = kwargs.pop("hidden_states", None)
          inputs = super().prepare_inputs_for_generation(
-             input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
+             input_ids,
+             past_key_values,
+             attention_mask,
+             inputs_embeds,
+             cache_position,
+             is_first_iteration=is_first_iteration,
+             **kwargs,
          )
-         # Decode stage
+
+         # Qwen3-Omni will prepare position ids in forward with deltas
+         inputs["position_ids"] = None
+
          # TODO(raushan, gante): Refactor this part to a utility function
-         if cache_position[0] != 0:
+         if not is_first_iteration and kwargs.get("use_cache", True):
              input_ids = input_ids[:, -1:]
              generation_step = kwargs.get("generation_step")
              trailing_text_hidden = kwargs.get("trailing_text_hidden")
@@ -3352,6 +3394,7 @@ class Qwen3OmniMoeConvNeXtBlock(nn.Module):
          return hidden_states


+ @use_kernelized_func(apply_rotary_pos_emb)
  class Qwen3OmniMoeCode2WavAttention(nn.Module):
      """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -3378,7 +3421,6 @@ class Qwen3OmniMoeCode2WavAttention(nn.Module):
          self.o_proj = nn.Linear(
              config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
          )
-         self.rotary_fn = apply_rotary_pos_emb
          self.q_norm = nn.Identity()
          self.k_norm = nn.Identity()
          self.sliding_window = config.sliding_window
@@ -3718,7 +3760,9 @@ class Qwen3OmniMoeCode2WavDecoderBlock(Qwen3OmniMoePreTrainedModel):

          self.block = nn.ModuleList(block)

-     def forward(self, hidden):
+         self.post_init()
+
+     def forward(self, hidden, **kwargs):
          for block in self.block:
              hidden = block(hidden)
          return hidden
@@ -3760,7 +3804,7 @@ class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel):

          self.post_init()

-     def forward(self, codes):
+     def forward(self, codes, **kwargs):
          if codes.shape[1] != self.config.num_quantizers:
              raise ValueError(f"Expected {self.config.num_quantizers} layer of codes, got {codes.shape[1]}")
          hidden = self.code_embedding(codes + self.code_offset).mean(1)
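
`code_offset` (now initialized in `_init_weights` above) shifts each residual quantizer's codes into a disjoint slice of one flat embedding table before the per-quantizer embeddings are averaged. Toy illustration of the indexing:

```python
import torch

num_quantizers, codebook_size, batch, frames = 3, 10, 1, 4
code_offset = torch.arange(num_quantizers).view(1, -1, 1) * codebook_size
codes = torch.randint(0, codebook_size, (batch, num_quantizers, frames))
flat_ids = codes + code_offset  # quantizer q uses ids in [q * 10, q * 10 + 10)
assert int(flat_ids.max()) < num_quantizers * codebook_size
```

The hunks below come from the modular definition of the same model (note the line numbering restarting at `@@ -62` and the `..qwen2_5_omni` relative imports); they mirror the changes shown above.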
@@ -62,7 +62,11 @@ from ..qwen2_5_omni.modeling_qwen2_5_omni import (
      Qwen2_5OmniThinkerForConditionalGeneration,
      SnakeBeta,
  )
- from ..qwen2_5_omni.processing_qwen2_5_omni import Qwen2_5OmniProcessor, Qwen2_5OmniProcessorKwargs
+ from ..qwen2_5_omni.processing_qwen2_5_omni import (
+     Qwen2_5OmniProcessor,
+     Qwen2_5OmniProcessorKwargs,
+     SinusoidsPositionEmbedding,
+ )
  from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
  from ..qwen3.configuration_qwen3 import Qwen3Config
  from ..qwen3.modeling_qwen3 import (
@@ -91,6 +95,7 @@ from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
      Qwen3VLMoeTextRotaryEmbedding,
      Qwen3VLMoeVisionAttention,
      Qwen3VLMoeVisionModel,
+     Qwen3VLMoeVisionRotaryEmbedding,
  )


@@ -668,6 +673,7 @@ class Qwen3OmniMoeTalkerConfig(PreTrainedConfig):
          self.audio_start_token_id = audio_start_token_id
          self.vision_start_token_id = vision_start_token_id
          self.speaker_id = speaker_id
+         self.initializer_range = self.text_config.initializer_range
          super().__init__(**kwargs)


@@ -758,6 +764,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
          upsampling_ratios=(2, 2),
          decoder_dim=1536,
          attention_dropout=0.0,
+         initializer_range=0.02,
          **kwargs,
      ):
          self.codebook_size = codebook_size
@@ -777,6 +784,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
          self.upsampling_ratios = upsampling_ratios
          self.decoder_dim = decoder_dim
          self.attention_dropout = attention_dropout
+         self.initializer_range = initializer_range
          self.rope_parameters = rope_parameters

          super().__init__(**kwargs)
@@ -865,6 +873,7 @@ class Qwen3OmniMoeConfig(PreTrainedConfig):
          self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
          self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
          self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
+         self.initializer_range = self.thinker_config.initializer_range
          self.enable_audio_output = enable_audio_output
          self.im_start_token_id = im_start_token_id
          self.im_end_token_id = im_end_token_id
@@ -900,6 +909,19 @@ class Qwen3OmniMoePreTrainedModel(Qwen2_5OmniPreTrainedModel, PreTrainedModel):
              init.normal_(module.experts.gate_up_proj, mean=0.0, std=std)
              init.normal_(module.experts.down_proj, mean=0.0, std=std)
              init.normal_(module.gate.weight, mean=0.0, std=std)
+         elif isinstance(module, Qwen3OmniMoeCode2Wav):
+             init.copy_(
+                 module.code_offset,
+                 torch.arange(module.config.num_quantizers).view(1, -1, 1) * module.config.codebook_size,
+             )
+         elif isinstance(module, SinusoidsPositionEmbedding):
+             log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+             inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+             scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+             init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+         elif isinstance(module, Qwen3OmniMoeVisionRotaryEmbedding):
+             inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+             init.copy_(module.inv_freq, inv_freq)


  class Qwen3OmniMoePreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
@@ -1205,6 +1227,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen2_5OmniAudioEncoder):
          input_features,
          feature_lens=None,
          aftercnn_lens=None,
+         **kwargs,
      ):
          aftercnn_lens = _get_feat_extract_output_lengths(feature_lens)
          chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
@@ -1296,6 +1319,10 @@ class Qwen3OmniMoeVisionPatchMerger(nn.Module):
          return hidden


+ class Qwen3OmniMoeVisionRotaryEmbedding(Qwen3VLMoeVisionRotaryEmbedding):
+     pass
+
+
  class Qwen3OmniMoeVisionEncoder(Qwen3VLMoeVisionModel):
      config: Qwen3OmniMoeVisionEncoderConfig
      _no_split_modules = ["Qwen3OmniMoeVisionBlock"]
@@ -1521,11 +1548,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(Qwen2_5OmniThinkerForCondition
          audio_feature_lengths = None

          if attention_mask is not None and position_ids is None:
-             if (
-                 cache_position is None
-                 or (cache_position is not None and cache_position[0] == 0)
-                 or self.rope_deltas is None
-             ):
+             past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+             if past_key_values_length == 0 or self.rope_deltas is None:
                  delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                  position_ids, rope_deltas = self.get_rope_index(
                      input_ids,
@@ -1540,7 +1564,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(Qwen2_5OmniThinkerForCondition
                  self.rope_deltas = rope_deltas
              else:
                  batch_size, seq_length = input_ids.shape
-                 delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
+                 delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                  position_ids = torch.arange(seq_length, device=input_ids.device)
                  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                  position_ids = position_ids.add(delta)
@@ -1849,6 +1873,9 @@ class Qwen3OmniMoeTalkerModel(Qwen3VLMoeTextModel):


  class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
+     _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
+     _tp_plan = {"codec_head": "colwise_rep"}
+     _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
      config_class = Qwen3OmniMoeTalkerConfig
      base_model_prefix = "talker"
      _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
@@ -1961,12 +1988,9 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
          if inputs_embeds is not None and inputs_embeds.shape[1] > 1:
              generation_step = -1
              residual_codes = None
-         if attention_mask is not None:
-             if (
-                 cache_position is None
-                 or (cache_position is not None and cache_position[0] == 0)
-                 or self.rope_deltas is None
-             ):
+         if position_ids is None:
+             past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+             if past_key_values_length == 0 or self.rope_deltas is None:
                  delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                  position_ids, rope_deltas = self.get_rope_index(
                      talker_input_ids,
@@ -1981,7 +2005,7 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
                  self.rope_deltas = rope_deltas
              else:
                  batch_size, seq_length = input_ids.shape
-                 delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
+                 delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                  position_ids = torch.arange(seq_length, device=input_ids.device)
                  position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                  position_ids = position_ids.add(delta)
@@ -2038,15 +2062,31 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
          return model_kwargs

      def prepare_inputs_for_generation(
-         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         inputs_embeds=None,
+         cache_position=None,
+         is_first_iteration=False,
+         **kwargs,
      ):
          hidden_states = kwargs.pop("hidden_states", None)
          inputs = super().prepare_inputs_for_generation(
-             input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
+             input_ids,
+             past_key_values,
+             attention_mask,
+             inputs_embeds,
+             cache_position,
+             is_first_iteration=is_first_iteration,
+             **kwargs,
          )
-         # Decode stage
+
+         # Qwen3-Omni will prepare position ids in forward with deltas
+         inputs["position_ids"] = None
+
          # TODO(raushan, gante): Refactor this part to a utility function
-         if cache_position[0] != 0:
+         if not is_first_iteration and kwargs.get("use_cache", True):
              input_ids = input_ids[:, -1:]
              generation_step = kwargs.get("generation_step")
              trailing_text_hidden = kwargs.get("trailing_text_hidden")
@@ -2339,7 +2379,9 @@ class Qwen3OmniMoeCode2WavDecoderBlock(Qwen3OmniMoePreTrainedModel):

          self.block = nn.ModuleList(block)

-     def forward(self, hidden):
+         self.post_init()
+
+     def forward(self, hidden, **kwargs):
          for block in self.block:
              hidden = block(hidden)
          return hidden
@@ -2381,7 +2423,7 @@ class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel):

          self.post_init()

-     def forward(self, codes):
+     def forward(self, codes, **kwargs):
          if codes.shape[1] != self.config.num_quantizers:
              raise ValueError(f"Expected {self.config.num_quantizers} layer of codes, got {codes.shape[1]}")
          hidden = self.code_embedding(codes + self.code_offset).mean(1)
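
The remaining hunks are docstring-only corrections in the Qwen3-VL configuration (`Qwen3VLTextConfig` / `Qwen3VLConfig`), again with their own line numbering.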
@@ -110,7 +110,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
              Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
              a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
              with longer `max_position_embeddings`.
-         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+         attention_bias (`bool`, *optional*, defaults to `False`):
              Whether to use a bias in the query, key, value and output projection layers during self-attention.
          attention_dropout (`float`, *optional*, defaults to 0.0):
              The dropout ratio for the attention probabilities.
@@ -197,13 +197,13 @@ class Qwen3VLConfig(PreTrainedConfig):
          vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
              The config object or dictionary of the vision backbone.
          image_token_id (`int`, *optional*, defaults to 151655):
-             The image token index to encode the image prompt.
+             The token id used as the placeholder for image inputs.
          video_token_id (`int`, *optional*, defaults to 151656):
-             The video token index to encode the image prompt.
+             The token id used as the placeholder for video inputs.
          vision_start_token_id (`int`, *optional*, defaults to 151652):
-             The start token index to encode the image prompt.
+             The token id that marks the start of a vision segment (image or video).
          vision_end_token_id (`int`, *optional*, defaults to 151653):
-             The end token index to encode the image prompt.
+             The token id that marks the end of a vision segment (image or video).
          tie_word_embeddings (`bool`, *optional*, defaults to `False`):
              Whether to tie the word embeddings.