transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -104,6 +105,8 @@ class Glm4vVisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -141,7 +144,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -313,8 +315,8 @@ class Glm4vVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention 2: Use cu_seqlens for variable length attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
@@ -403,7 +405,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -705,6 +707,12 @@ class Glm4vPreTrainedModel(PreTrainedModel):
         "attentions": Glm4vTextAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -1487,6 +1495,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1503,13 +1512,14 @@
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
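The last two hunks replace the `cache_position[0] != 0` check with an explicit `is_first_iteration` flag threaded through `prepare_inputs_for_generation`: pixel inputs only need to be encoded once, on the prefill step, after which the vision states already live in the KV cache. A minimal sketch of the gating pattern, using a hypothetical standalone helper rather than the actual transformers method:

# Sketch only: `gate_multimodal_inputs` is a hypothetical helper mirroring the
# condition introduced in the hunk above; it is not part of the transformers API.
def gate_multimodal_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
    if not is_first_iteration and use_cache:
        # Decode steps reuse cached vision states; no need to re-encode pixels.
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None
    return model_inputs

Compared with testing `cache_position[0] != 0`, an explicit flag presumably also stays correct when the cache position is nonzero on a step that still carries fresh image inputs (for example, chunked prefill).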
@@ -22,6 +22,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
@@ -32,7 +33,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
@@ -409,7 +410,6 @@
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -725,6 +725,12 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
         "attentions": Glm4vTextAttention,
     }
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -1414,6 +1420,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1430,13 +1437,14 @@
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
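Both GLM-4V files also turn `original_inv_freq` from a plain attribute into a non-persistent buffer and add `_init_weights` overrides that recompute `inv_freq`. The PyTorch semantics this relies on: a buffer registered with `persistent=False` follows the module across `.to(...)` but is excluded from `state_dict()`, so it never comes from the checkpoint and must be rebuilt at initialization time. A small self-contained check of that behavior (a sketch, not transformers code):

import torch
import torch.nn as nn

class Rotary(nn.Module):
    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        # Non-persistent: moves with .to(...) but is never serialized.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

m = Rotary(dim=8)
assert "inv_freq" not in m.state_dict()                     # absent from checkpoints
assert m.to(torch.float64).inv_freq.dtype == torch.float64  # but follows .to()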
@@ -32,7 +32,7 @@ from ... import initialization as init
32
32
  from ...activations import ACT2FN
33
33
  from ...cache_utils import Cache, DynamicCache
34
34
  from ...generation import GenerationMixin
35
- from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
35
+ from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
36
36
  from ...masking_utils import create_causal_mask
37
37
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
38
38
  from ...modeling_layers import GradientCheckpointingLayer
@@ -40,7 +40,13 @@ from ...modeling_outputs import ModelOutput, MoeModelOutputWithPast
40
40
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
41
41
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
42
42
  from ...processing_utils import Unpack
43
- from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
43
+ from ...utils import (
44
+ TransformersKwargs,
45
+ auto_docstring,
46
+ can_return_tuple,
47
+ is_grouped_mm_available,
48
+ is_torchdynamo_compiling,
49
+ )
44
50
  from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
45
51
  from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig
46
52
 
@@ -107,7 +113,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
107
113
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
108
114
 
109
115
  self.register_buffer("inv_freq", inv_freq, persistent=False)
110
- self.original_inv_freq = inv_freq
116
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
111
117
 
112
118
  @staticmethod
113
119
  def compute_default_rope_parameters(
@@ -395,6 +401,7 @@ class Glm4vMoeTextTopkRouter(nn.Module):
395
401
  return router_logits
396
402
 
397
403
 
404
+ @use_experts_implementation
398
405
  class Glm4vMoeTextNaiveMoe(nn.Module):
399
406
  """Collection of expert weights stored as 3D tensors."""
400
407
 
@@ -402,7 +409,7 @@ class Glm4vMoeTextNaiveMoe(nn.Module):
402
409
  super().__init__()
403
410
  self.num_experts = config.num_local_experts
404
411
  self.hidden_dim = config.hidden_size
405
- self.intermediate_dim = config.intermediate_size
412
+ self.intermediate_dim = config.moe_intermediate_size
406
413
  self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
407
414
  self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
408
415
  self.act_fn = ACT2FN[config.hidden_act]
@@ -586,7 +593,9 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
586
593
  _supports_flash_attn = True
587
594
  _supports_sdpa = True
588
595
  _supports_flex_attn = True
589
- _can_compile_fullgraph = False
596
+ _can_compile_fullgraph = (
597
+ is_grouped_mm_available()
598
+ ) # https://huggingface.co/docs/transformers/experts_interface#torchcompile
590
599
  _supports_attention_backend = True
591
600
 
592
601
  _can_record_outputs = {
@@ -602,9 +611,13 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
602
611
  super()._init_weights(module)
603
612
  if isinstance(module, Glm4vMoeTextTopkRouter):
604
613
  init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
614
+ init.zeros_(module.e_score_correction_bias)
605
615
  elif isinstance(module, Glm4vMoeTextNaiveMoe):
606
616
  init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
607
617
  init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
618
+ if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
619
+ inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
620
+ init.copy_(module.inv_freq, inv_freq)
608
621
 
609
622
 
610
623
  @dataclass
@@ -637,6 +650,22 @@ class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
637
650
  aux_loss: Optional[torch.FloatTensor] = None
638
651
 
639
652
 
653
+ class Glm4vMoeVisionRotaryEmbedding(nn.Module):
654
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
655
+
656
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
657
+ super().__init__()
658
+ self.dim = dim
659
+ self.theta = theta
660
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
661
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
662
+
663
+ def forward(self, seqlen: int) -> torch.Tensor:
664
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
665
+ freqs = torch.outer(seq, self.inv_freq)
666
+ return freqs
667
+
668
+
640
669
  class Glm4vMoeisionMlp(nn.Module):
641
670
  def __init__(self, config, bias: bool = False):
642
671
  super().__init__()
@@ -671,20 +700,6 @@ class Glm4vMoeVisionPatchEmbed(nn.Module):
671
700
  return hidden_states
672
701
 
673
702
 
674
- class Glm4vMoeVisionRotaryEmbedding(nn.Module):
675
- inv_freq: torch.Tensor # fix linting for `register_buffer`
676
-
677
- def __init__(self, dim: int, theta: float = 10000.0) -> None:
678
- super().__init__()
679
- inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
680
- self.register_buffer("inv_freq", inv_freq, persistent=False)
681
-
682
- def forward(self, seqlen: int) -> torch.Tensor:
683
- seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
684
- freqs = torch.outer(seq, self.inv_freq)
685
- return freqs
686
-
687
-
688
703
  class Glm4vMoeVisionPatchMerger(nn.Module):
689
704
  def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
690
705
  super().__init__()
@@ -713,7 +728,6 @@ class Glm4vMoeVisionEmbeddings(nn.Module):
713
728
  self.num_patches = (self.image_size // self.patch_size) ** 2
714
729
  self.num_positions = self.num_patches
715
730
  self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
716
- self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
717
731
 
718
732
  def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
719
733
  """
@@ -840,8 +854,8 @@ class Glm4vMoeVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention 2: Use cu_seqlens for variable length attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
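Matching on the substring rather than the exact string lets every flash-style backend take the varlen cu_seqlens path, not only flash_attention_2. Illustrative comparison, assuming the usual implementation names:

    for impl in ("flash_attention_2", "flash_attention_3", "sdpa", "eager"):
        print(impl, "->", "flash" in impl)
    # flash_attention_2 -> True
    # flash_attention_3 -> True
    # sdpa -> False
    # eager -> False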
@@ -1763,6 +1777,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1779,13 +1794,14 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepared with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

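The old cache_position[0] != 0 test inferred "first step" from the cache write index; the new flag is passed down explicitly and also behaves correctly when use_cache=False. A minimal sketch of the gating logic in isolation (hypothetical helper name, for illustration):

    def should_forward_pixels(is_first_iteration: bool, use_cache: bool) -> bool:
        # Vision inputs are only needed when the prompt (with image tokens)
        # is encoded; cached decode steps reuse the computed embeddings.
        return is_first_iteration or not use_cache

    assert should_forward_pixels(True, True)        # prefill: encode the image
    assert not should_forward_pixels(False, True)   # cached decode: skip it
    assert should_forward_pixels(False, False)      # no cache: re-encode each step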
@@ -18,6 +18,7 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn

+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
 from ...masking_utils import create_causal_mask
@@ -46,6 +47,7 @@ from ..glm4v.modeling_glm4v import (
     Glm4vTextModel,
     Glm4vTextRotaryEmbedding,
     Glm4vVisionModel,
+    Glm4vVisionRotaryEmbedding,
     rotate_half,
 )
 from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
@@ -479,11 +481,21 @@ class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vMoeCausalLMOutputWithPast(Qwen3VLMoeCausalLMOutputWithPast):
     pass


+class Glm4vMoeVisionRotaryEmbedding(Glm4vVisionRotaryEmbedding):
+    pass
+
+
 @auto_docstring
 class Glm4vMoeVisionModel(Glm4vVisionModel):
     pass
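Because inv_freq is registered with persistent=False, it is absent from checkpoints and must be recomputed when the model is materialized (e.g. after meta-device loading), which is what the _init_weights override above does. The same recovery pattern in isolation (a sketch, assuming a module exposing dim and theta):

    import torch

    def reinit_inv_freq(module: torch.nn.Module) -> None:
        # Rebuild the non-persistent buffer from the module's own hyperparameters.
        inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
        with torch.no_grad():
            module.inv_freq.copy_(inv_freq)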
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glmasr import *
+    from .modeling_glmasr import *
+    from .processing_glmasr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
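The _LazyModule indirection means `import transformers` does not pay for the GLM-ASR modeling code up front; submodules are imported on first attribute access. Illustration (assuming the package registers the module as transformers.models.glmasr):

    from transformers.models import glmasr

    # configuration_glmasr is only imported now, when the name is first resolved:
    config = glmasr.GlmAsrConfig()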
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PreTrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class GlmAsrEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmAsrEncoder`]. It is used to instantiate a
+    GLM-ASR audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the GLM-ASR
+    architecture.
+
+    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        max_position_embeddings (`int`, *optional*, defaults to 1500):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_mel_bins (`int`, *optional*, defaults to 128):
+            Number of mel features per input feature. Should correspond to the value used in the
+            `GlmAsrProcessor` class.
+
+    ```python
+    >>> from transformers import GlmAsrEncoderConfig, GlmAsrEncoder
+
+    >>> # Initializing a GlmAsrEncoderConfig
+    >>> configuration = GlmAsrEncoderConfig()
+
+    >>> # Initializing a GlmAsrEncoder (with random weights)
+    >>> model = GlmAsrEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glmasr_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1280,
+        intermediate_size=5120,
+        num_hidden_layers=32,
+        num_attention_heads=20,
+        num_key_value_heads=None,
+        hidden_act="gelu",
+        max_position_embeddings=1500,
+        initializer_range=0.02,
+        rope_parameters=None,
+        attention_dropout=0.0,
+        num_mel_bins=128,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.head_dim = hidden_size // num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_parameters = rope_parameters
+        self.attention_dropout = attention_dropout
+        self.num_mel_bins = num_mel_bins
+
+        kwargs.setdefault("partial_rotary_factor", 0.5)
+        super().__init__(**kwargs)
+
+
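With partial_rotary_factor defaulting to 0.5, rotary embeddings cover half of each head dimension: head_dim = 1280 // 20 = 64, so 32 dimensions per head are rotated. Quick illustrative check (assuming the base config keeps unrecognized kwargs as attributes, as PreTrainedConfig normally does):

    cfg = GlmAsrEncoderConfig()
    print(cfg.head_dim)                                   # 64
    print(int(cfg.head_dim * cfg.partial_rotary_factor))  # 32 rotary dims per head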
+class GlmAsrConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmAsrForConditionalGeneration`]. It is used to
+    instantiate a GLM-ASR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of GLM-ASR-Nano-2512.
+
+    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        audio_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object or dictionary of the audio encoder.
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object or dictionary of the text model.
+        audio_token_id (`int`, *optional*, defaults to 59260):
+            The audio token index to encode the audio prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function (function or string) in the multi-modal projector.
+
+    ```python
+    >>> from transformers import GlmAsrForConditionalGeneration, GlmAsrConfig
+
+    >>> # Initializing a GlmAsr configuration
+    >>> configuration = GlmAsrConfig()
+
+    >>> # Initializing a GLM-ASR-Nano-2512 model with random weights
+    >>> model = GlmAsrForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glmasr"
+    sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}
+
+    _default_text_config_kwargs = {
+        "vocab_size": 59264,
+        "hidden_size": 2048,
+        "intermediate_size": 6144,
+        "num_hidden_layers": 28,
+        "num_attention_heads": 16,
+        "num_key_value_heads": 4,
+        "max_position_embeddings": 8192,
+        "rms_norm_eps": 1e-05,
+        "use_cache": True,
+        "eos_token_id": [59246, 59253, 59255],
+        "rope_parameters": {"rope_theta": 10000.0, "rope_type": "default"},
+    }
+
+    def __init__(
+        self,
+        audio_config=None,
+        text_config=None,
+        audio_token_id=59260,
+        projector_hidden_act="gelu",
+        **kwargs,
+    ):
+        if isinstance(audio_config, dict):
+            audio_config["model_type"] = audio_config.get("model_type", "glmasr_encoder")
+            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
+        elif audio_config is None:
+            audio_config = CONFIG_MAPPING["glmasr_encoder"]()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](
+                **{**self._default_text_config_kwargs, **text_config}
+            )
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs)
+        self.text_config = text_config
+
+        self.vocab_size = text_config.vocab_size
+        self.hidden_size = text_config.hidden_size
+        self.audio_token_id = audio_token_id
+        self.projector_hidden_act = projector_hidden_act
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["GlmAsrEncoderConfig", "GlmAsrConfig"]
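In GlmAsrConfig.__init__, a user-supplied text_config dict is merged over _default_text_config_kwargs, so explicit keys win while the rest fall back to the GLM-ASR defaults. For example (illustrative):

    cfg = GlmAsrConfig(text_config={"num_hidden_layers": 2})
    print(cfg.text_config.model_type)         # "llama" (default)
    print(cfg.text_config.num_hidden_layers)  # 2 (user override)
    print(cfg.text_config.hidden_size)        # 2048 (class default kept)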