transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
23
23
  from .albert import *
24
24
  from .align import *
25
25
  from .altclip import *
26
+ from .apertus import *
26
27
  from .arcee import *
27
28
  from .aria import *
28
29
  from .audio_spectrogram_transformer import *
@@ -107,6 +108,7 @@ if TYPE_CHECKING:
107
108
  from .dinov3_vit import *
108
109
  from .distilbert import *
109
110
  from .dit import *
111
+ from .doge import *
110
112
  from .donut import *
111
113
  from .dots1 import *
112
114
  from .dpr import *
@@ -119,7 +121,11 @@ if TYPE_CHECKING:
119
121
  from .emu3 import *
120
122
  from .encodec import *
121
123
  from .encoder_decoder import *
124
+ from .eomt import *
122
125
  from .ernie import *
126
+ from .ernie4_5 import *
127
+ from .ernie4_5_moe import *
128
+ from .ernie4_5_vl_moe import *
123
129
  from .esm import *
124
130
  from .evolla import *
125
131
  from .exaone4 import *
@@ -144,9 +150,11 @@ if TYPE_CHECKING:
144
150
  from .git import *
145
151
  from .glm import *
146
152
  from .glm4 import *
153
+ from .glm4_moe import *
147
154
  from .glm4v import *
148
155
  from .glm4v_moe import *
149
156
  from .glm46v import *
157
+ from .glmasr import *
150
158
  from .glpn import *
151
159
  from .got_ocr2 import *
152
160
  from .gpt2 import *
@@ -181,10 +189,12 @@ if TYPE_CHECKING:
181
189
  from .instructblip import *
182
190
  from .instructblipvideo import *
183
191
  from .internvl import *
192
+ from .jais2 import *
184
193
  from .jamba import *
185
194
  from .janus import *
186
195
  from .jetmoe import *
187
196
  from .kosmos2 import *
197
+ from .kosmos2_5 import *
188
198
  from .kyutai_speech_to_text import *
189
199
  from .lasr import *
190
200
  from .layoutlm import *
@@ -220,6 +230,7 @@ if TYPE_CHECKING:
220
230
  from .mbart50 import *
221
231
  from .megatron_bert import *
222
232
  from .megatron_gpt2 import *
233
+ from .metaclip_2 import *
223
234
  from .mgp_str import *
224
235
  from .mimi import *
225
236
  from .minimax import *
@@ -231,6 +242,7 @@ if TYPE_CHECKING:
231
242
  from .mlcd import *
232
243
  from .mllama import *
233
244
  from .mluke import *
245
+ from .mm_grounding_dino import *
234
246
  from .mobilebert import *
235
247
  from .mobilenet_v1 import *
236
248
  from .mobilenet_v2 import *
@@ -270,6 +282,9 @@ if TYPE_CHECKING:
270
282
  from .parakeet import *
271
283
  from .patchtsmixer import *
272
284
  from .patchtst import *
285
+ from .pe_audio import *
286
+ from .pe_audio_video import *
287
+ from .pe_video import *
273
288
  from .pegasus import *
274
289
  from .pegasus_x import *
275
290
  from .perceiver import *
@@ -281,6 +296,7 @@ if TYPE_CHECKING:
281
296
  from .phimoe import *
282
297
  from .phobert import *
283
298
  from .pix2struct import *
299
+ from .pixio import *
284
300
  from .pixtral import *
285
301
  from .plbart import *
286
302
  from .poolformer import *
@@ -317,8 +333,10 @@ if TYPE_CHECKING:
317
333
  from .sam import *
318
334
  from .sam2 import *
319
335
  from .sam2_video import *
336
+ from .sam3 import *
320
337
  from .sam3_tracker import *
321
338
  from .sam3_tracker_video import *
339
+ from .sam3_video import *
322
340
  from .sam_hq import *
323
341
  from .seamless_m4t import *
324
342
  from .seamless_m4t_v2 import *
@@ -330,6 +348,7 @@ if TYPE_CHECKING:
330
348
  from .shieldgemma2 import *
331
349
  from .siglip import *
332
350
  from .siglip2 import *
351
+ from .smollm3 import *
333
352
  from .smolvlm import *
334
353
  from .speech_encoder_decoder import *
335
354
  from .speech_to_text import *
@@ -25,11 +25,11 @@ from typing import Optional, Union
25
25
  import torch
26
26
  from torch import nn
27
27
 
28
+ from ... import initialization as init
28
29
  from ...activations import ACT2FN
29
30
  from ...cache_utils import Cache, DynamicCache
30
31
  from ...generation import GenerationMixin
31
- from ...integrations import use_kernel_func_from_hub, use_kernelized_func
32
- from ...integrations.hub_kernels import use_kernel_forward_from_hub
32
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
33
33
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
34
34
  from ...modeling_layers import GradientCheckpointingLayer
35
35
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, MoeModelOutputWithPast
@@ -58,7 +58,7 @@ class AfmoeRotaryEmbedding(nn.Module):
58
58
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
59
59
 
60
60
  self.register_buffer("inv_freq", inv_freq, persistent=False)
61
- self.original_inv_freq = inv_freq
61
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
62
62
 
63
63
  @staticmethod
64
64
  def compute_default_rope_parameters(
@@ -531,20 +531,11 @@ class AfmoePreTrainedModel(PreTrainedModel):
531
531
 
532
532
  def _init_weights(self, module):
533
533
  """Initialize the weights"""
534
- if isinstance(module, nn.Linear):
535
- nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
536
- if module.bias is not None:
537
- nn.init.zeros_(module.bias)
538
- elif isinstance(module, nn.Embedding):
539
- nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
540
- if module.padding_idx is not None:
541
- nn.init.zeros_(module.weight[module.padding_idx])
542
- elif isinstance(module, AfmoeRMSNorm):
543
- nn.init.ones_(module.weight)
544
- elif isinstance(module, AfmoeTokenChoiceRouter):
545
- nn.init.zeros_(module.gate.weight)
534
+ super()._init_weights(module)
535
+ if isinstance(module, AfmoeTokenChoiceRouter):
536
+ init.zeros_(module.gate.weight)
546
537
  elif isinstance(module, AfmoeMoE):
547
- nn.init.zeros_(module.expert_bias)
538
+ init.zeros_(module.expert_bias)
548
539
 
549
540
 
550
541
  @auto_docstring
@@ -20,6 +20,7 @@ from typing import Optional
20
20
  import torch
21
21
  from torch import nn
22
22
 
23
+ from ... import initialization as init
23
24
  from ...cache_utils import Cache, DynamicCache
24
25
  from ...generation import GenerationMixin
25
26
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
@@ -350,20 +351,11 @@ class AfmoePreTrainedModel(PreTrainedModel):
350
351
 
351
352
  def _init_weights(self, module):
352
353
  """Initialize the weights"""
353
- if isinstance(module, nn.Linear):
354
- nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
355
- if module.bias is not None:
356
- nn.init.zeros_(module.bias)
357
- elif isinstance(module, nn.Embedding):
358
- nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
359
- if module.padding_idx is not None:
360
- nn.init.zeros_(module.weight[module.padding_idx])
361
- elif isinstance(module, AfmoeRMSNorm):
362
- nn.init.ones_(module.weight)
363
- elif isinstance(module, AfmoeTokenChoiceRouter):
364
- nn.init.zeros_(module.gate.weight)
354
+ super()._init_weights(module)
355
+ if isinstance(module, AfmoeTokenChoiceRouter):
356
+ init.zeros_(module.gate.weight)
365
357
  elif isinstance(module, AfmoeMoE):
366
- nn.init.zeros_(module.expert_bias)
358
+ init.zeros_(module.expert_bias)
367
359
 
368
360
 
369
361
  @auto_docstring
@@ -414,6 +414,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
414
414
  init.constant_(module.logit_scale, math.log(1 / 0.07))
415
415
  elif isinstance(module, Aimv2AttentionPoolingHead):
416
416
  init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
417
+ elif isinstance(module, Aimv2VisionEmbeddings):
418
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
419
+ elif isinstance(module, Aimv2TextEmbeddings):
420
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
417
421
 
418
422
 
419
423
  @auto_docstring(
@@ -457,6 +457,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
457
457
  init.constant_(module.logit_scale, math.log(1 / 0.07))
458
458
  elif isinstance(module, Aimv2AttentionPoolingHead):
459
459
  init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
460
+ elif isinstance(module, Aimv2VisionEmbeddings):
461
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
462
+ elif isinstance(module, Aimv2TextEmbeddings):
463
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
460
464
 
461
465
 
462
466
  @auto_docstring(
@@ -320,6 +320,9 @@ class AlbertPreTrainedModel(PreTrainedModel):
320
320
  init.ones_(module.weight)
321
321
  elif isinstance(module, AlbertMLMHead):
322
322
  init.zeros_(module.bias)
323
+ elif isinstance(module, AlbertEmbeddings):
324
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
325
+ init.zeros_(module.token_type_ids)
323
326
 
324
327
 
325
328
  @dataclass
@@ -781,9 +781,9 @@ class AlignTextEncoder(nn.Module):
781
781
  all_hidden_states = all_hidden_states + (hidden_states,)
782
782
 
783
783
  layer_outputs = layer_module(
784
- hidden_states=hidden_states,
785
- attention_mask=attention_mask,
786
- output_attentions=output_attentions,
784
+ hidden_states,
785
+ attention_mask,
786
+ output_attentions,
787
787
  **kwargs,
788
788
  )
789
789
 
@@ -844,6 +844,13 @@ class AlignPreTrainedModel(PreTrainedModel):
844
844
  if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
845
845
  init.zeros_(module.bias)
846
846
  init.ones_(module.weight)
847
+ if getattr(module, "running_mean", None) is not None:
848
+ init.zeros_(module.running_mean)
849
+ init.ones_(module.running_var)
850
+ init.zeros_(module.num_batches_tracked)
851
+ elif isinstance(module, AlignTextEmbeddings):
852
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
853
+ init.zeros_(module.token_type_ids)
847
854
 
848
855
 
849
856
  @auto_docstring(
@@ -976,6 +983,8 @@ class AlignVisionModel(AlignPreTrainedModel):
976
983
  main_input_name = "pixel_values"
977
984
  input_modalities = ("image",)
978
985
  supports_gradient_checkpointing = False
986
+ _input_embed_layer = "convolution"
987
+ _no_split_modules = ["AlignVisionBlock"]
979
988
 
980
989
  def __init__(self, config: AlignVisionConfig):
981
990
  super().__init__(config)
@@ -994,9 +1003,6 @@ class AlignVisionModel(AlignPreTrainedModel):
994
1003
  # Initialize weights and apply final processing
995
1004
  self.post_init()
996
1005
 
997
- def get_input_embeddings(self) -> nn.Module:
998
- return self.vision_model.embeddings.convolution
999
-
1000
1006
  @can_return_tuple
1001
1007
  @auto_docstring
1002
1008
  def forward(
@@ -393,9 +393,9 @@ class AltRobertaEncoder(nn.Module):
393
393
  all_hidden_states = all_hidden_states + (hidden_states,)
394
394
 
395
395
  layer_outputs = layer_module(
396
- hidden_states=hidden_states,
397
- attention_mask=attention_mask,
398
- output_attentions=output_attentions,
396
+ hidden_states,
397
+ attention_mask,
398
+ output_attentions,
399
399
  **kwargs,
400
400
  )
401
401
 
@@ -780,6 +780,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
780
780
  init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
781
781
  init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
782
782
  init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
783
+ init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
783
784
  elif isinstance(module, AltCLIPAttention):
784
785
  factor = self.config.initializer_factor
785
786
  in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
@@ -815,6 +816,9 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
815
816
  # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
816
817
  if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
817
818
  init.zeros_(module.weight[module.padding_idx])
819
+ elif isinstance(module, AltRobertaEmbeddings):
820
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
821
+ init.zeros_(module.token_type_ids)
818
822
 
819
823
 
820
824
  class AltCLIPVisionTransformer(nn.Module):
@@ -25,7 +25,7 @@ from typing import Optional, Union
25
25
  import torch
26
26
  from torch import nn
27
27
 
28
- from ...activations import ACT2FN
28
+ from ...activations import ACT2CLS, ACT2FN
29
29
  from ...cache_utils import Cache, DynamicCache
30
30
  from ...generation import GenerationMixin
31
31
  from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
@@ -49,6 +49,8 @@ class ApertusMLP(nn.Module):
49
49
  self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
50
50
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
51
51
  self.act_fn = ACT2FN[config.hidden_act]
52
+ if config.hidden_act == "xielu":
53
+ self.act_fn = ACT2CLS["xielu"](dtype=config.dtype)
52
54
 
53
55
  def forward(self, x):
54
56
  return self.down_proj(self.act_fn(self.up_proj(x)))
@@ -92,7 +94,7 @@ class ApertusRotaryEmbedding(nn.Module):
92
94
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
93
95
 
94
96
  self.register_buffer("inv_freq", inv_freq, persistent=False)
95
- self.original_inv_freq = inv_freq
97
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
96
98
 
97
99
  @staticmethod
98
100
  def compute_default_rope_parameters(
@@ -19,6 +19,7 @@ from typing import Optional
19
19
  import torch
20
20
  from torch import nn
21
21
 
22
+ from ...activations import ACT2CLS
22
23
  from ...cache_utils import Cache
23
24
  from ...configuration_utils import PreTrainedConfig
24
25
  from ...modeling_rope_utils import RopeParameters
@@ -192,9 +193,11 @@ class ApertusConfig(PreTrainedConfig):
192
193
 
193
194
  class ApertusMLP(NemotronMLP):
194
195
  def __init__(self, config):
195
- super().__init__()
196
+ super().__init__(config)
196
197
  self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
197
198
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
199
+ if config.hidden_act == "xielu":
200
+ self.act_fn = ACT2CLS["xielu"](dtype=config.dtype)
198
201
 
199
202
 
200
203
  class ApertusRMSNorm(LlamaRMSNorm):
@@ -99,7 +99,7 @@ class ArceeRotaryEmbedding(nn.Module):
99
99
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
100
100
 
101
101
  self.register_buffer("inv_freq", inv_freq, persistent=False)
102
- self.original_inv_freq = inv_freq
102
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
103
103
 
104
104
  @staticmethod
105
105
  def compute_default_rope_parameters(
@@ -636,7 +636,7 @@ class AriaTextRotaryEmbedding(nn.Module):
636
636
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
637
637
 
638
638
  self.register_buffer("inv_freq", inv_freq, persistent=False)
639
- self.original_inv_freq = inv_freq
639
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
640
640
 
641
641
  @staticmethod
642
642
  def compute_default_rope_parameters(
@@ -1203,6 +1203,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
1203
1203
  attention_mask=None,
1204
1204
  cache_position=None,
1205
1205
  logits_to_keep=None,
1206
+ is_first_iteration=False,
1206
1207
  **kwargs,
1207
1208
  ):
1208
1209
  model_inputs = super().prepare_inputs_for_generation(
@@ -1212,12 +1213,15 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
1212
1213
  attention_mask=attention_mask,
1213
1214
  cache_position=cache_position,
1214
1215
  logits_to_keep=logits_to_keep,
1216
+ is_first_iteration=is_first_iteration,
1215
1217
  **kwargs,
1216
1218
  )
1217
1219
 
1218
- if cache_position[0] == 0:
1219
- # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
1220
- # Otherwise we need pixel values to be passed to model
1220
+ if is_first_iteration or not kwargs.get("use_cache", True):
1221
+ # Pixel values are used only in the first iteration if available
1222
+ # In subsquent iterations, they are already merged with text and cached
1223
+ # NOTE: first iteration doesn't have to be prefill, it can be the first
1224
+ # iteration with a question and cached system prompt (continue generate from cache)
1221
1225
  model_inputs["pixel_values"] = pixel_values
1222
1226
  model_inputs["pixel_mask"] = pixel_mask
1223
1227
 
@@ -1500,6 +1500,7 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
1500
1500
  attention_mask=None,
1501
1501
  cache_position=None,
1502
1502
  logits_to_keep=None,
1503
+ is_first_iteration=False,
1503
1504
  **kwargs,
1504
1505
  ):
1505
1506
  model_inputs = super().prepare_inputs_for_generation(
@@ -1509,12 +1510,15 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
1509
1510
  attention_mask=attention_mask,
1510
1511
  cache_position=cache_position,
1511
1512
  logits_to_keep=logits_to_keep,
1513
+ is_first_iteration=is_first_iteration,
1512
1514
  **kwargs,
1513
1515
  )
1514
1516
 
1515
- if cache_position[0] == 0:
1516
- # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
1517
- # Otherwise we need pixel values to be passed to model
1517
+ if is_first_iteration or not kwargs.get("use_cache", True):
1518
+ # Pixel values are used only in the first iteration if available
1519
+ # In subsquent iterations, they are already merged with text and cached
1520
+ # NOTE: first iteration doesn't have to be prefill, it can be the first
1521
+ # iteration with a question and cached system prompt (continue generate from cache)
1518
1522
  model_inputs["pixel_values"] = pixel_values
1519
1523
  model_inputs["pixel_mask"] = pixel_mask
1520
1524
 
@@ -32,9 +32,6 @@ if is_torch_available():
32
32
 
33
33
  logger = logging.get_logger(__name__)
34
34
 
35
- MAX_AUDIO_LEN = 10 * 60 # 10 minutes
36
- DEFAULT_TRANSCRIPTION_PROMPT = "Transcribe the input speech."
37
-
38
35
 
39
36
  class AudioFlamingo3ProcessorKwargs(ProcessingKwargs, total=False):
40
37
  _defaults = {
@@ -63,32 +60,41 @@ class AudioFlamingo3Processor(ProcessorMixin):
63
60
  [`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.
64
61
 
65
62
  Args:
66
- feature_extractor ([`WhisperFeatureExtractor`]):
67
- The feature extractor is a required input.
68
- tokenizer ([`Qwen2TokenizerFast`]):
69
- The tokenizer is a required input.
70
- chat_template (`Optional[str]`, *optional*):
71
- The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
72
- template will be used.
73
- audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
74
- Special token used to represent audio inputs in the chat template.
63
+ feature_extractor ([`WhisperFeatureExtractor`]):
64
+ The feature extractor is a required input.
65
+ tokenizer ([`Qwen2TokenizerFast`]):
66
+ The tokenizer is a required input.
67
+ chat_template (`Optional[str]`, *optional*):
68
+ The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
69
+ template will be used.
70
+ audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
71
+ Special token used to represent audio inputs in the chat template.
72
+ default_transcription_prompt (`str`, *optional*, defaults to `"Transcribe the input speech."`):
73
+ Default prompt to use for transcription tasks when applying transcription requests.
74
+ max_audio_len (`int`, *optional*, defaults to 600):
75
+ Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
75
76
  """
76
77
 
77
- attributes = ["feature_extractor", "tokenizer"]
78
- feature_extractor_class = "WhisperFeatureExtractor"
79
- tokenizer_class = "Qwen2TokenizerFast"
80
-
81
78
  def __init__(
82
79
  self,
83
80
  feature_extractor,
84
81
  tokenizer,
85
82
  chat_template=None,
86
83
  audio_token="<sound>",
84
+ default_transcription_prompt="Transcribe the input speech.",
85
+ max_audio_len=600,
87
86
  ):
88
87
  self.audio_token = audio_token
89
88
  self.audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
89
+ self.default_transcription_prompt = default_transcription_prompt
90
+ self.max_audio_len = max_audio_len
90
91
  super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
91
92
 
93
+ def _get_audio_token_length(self, audio_lengths: "torch.Tensor") -> "torch.Tensor":
94
+ conv_output_lengths = (audio_lengths - 1) // 2 + 1 # After conv2 downsampling
95
+ audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1 # After avg pooling
96
+ return audio_tokens_lengths
97
+
92
98
  def __call__(
93
99
  self,
94
100
  text: Union[TextInput, list[TextInput]],
@@ -143,7 +149,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
143
149
 
144
150
  # Determine number of chunks per sample, and flatten
145
151
  window_size = int(audio_kwargs["sampling_rate"] * audio_kwargs["chunk_length"])
146
- max_windows = int(MAX_AUDIO_LEN // audio_kwargs["chunk_length"])
152
+ max_windows = int(self.max_audio_len // audio_kwargs["chunk_length"])
147
153
 
148
154
  per_sample_windows: list[int] = []
149
155
  flat_chunks: list[np.ndarray] = []
@@ -153,7 +159,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
153
159
  n_win = max(1, (n_samples + window_size - 1) // window_size)
154
160
  if n_win > max_windows:
155
161
  logger.warning(
156
- f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {MAX_AUDIO_LEN}s; truncating to first {MAX_AUDIO_LEN}s."
162
+ f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {self.max_audio_len}s; truncating to first {self.max_audio_len}s."
157
163
  )
158
164
  n_win = max_windows
159
165
  per_sample_windows.append(n_win)
@@ -171,8 +177,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
171
177
 
172
178
  # Compute sequence lengths token counting
173
179
  audio_lengths = torch.stack([s.sum() for s in torch.split(padding_mask.sum(-1), per_sample_windows)])
174
- conv_output_lengths = (audio_lengths - 1) // 2 + 1 # After conv2 downsampling
175
- audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1 # After avg pooling
180
+ audio_tokens_lengths = self._get_audio_token_length(audio_lengths)
176
181
 
177
182
  # expand audio tokens in text
178
183
  for i, audio_length in enumerate(audio_tokens_lengths):
@@ -236,7 +241,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
236
241
  raise ValueError("`audio` must contain at least one sample.")
237
242
 
238
243
  if prompt is None:
239
- prompts = [DEFAULT_TRANSCRIPTION_PROMPT] * batch_size
244
+ prompts = [self.default_transcription_prompt] * batch_size
240
245
  elif isinstance(prompt, str):
241
246
  prompts = [prompt] * batch_size
242
247
  elif isinstance(prompt, (list, tuple)):
@@ -247,7 +252,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
247
252
  prompts = []
248
253
  for item in prompt:
249
254
  if item is None:
250
- prompts.append(DEFAULT_TRANSCRIPTION_PROMPT)
255
+ prompts.append(self.default_transcription_prompt)
251
256
  elif isinstance(item, str):
252
257
  prompts.append(item)
253
258
  else:
@@ -543,7 +543,7 @@ def add_generation_mixin_to_remote_model(model_class):
543
543
 
544
544
  class _LazyAutoMapping(OrderedDict[type[PreTrainedConfig], _LazyAutoMappingValue]):
545
545
  """
546
- " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed.
546
+ A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed.
547
547
 
548
548
  Args:
549
549
  - config_mapping: The map model type to config class
@@ -142,6 +142,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
142
142
  ("ernie", "ErnieConfig"),
143
143
  ("ernie4_5", "Ernie4_5Config"),
144
144
  ("ernie4_5_moe", "Ernie4_5_MoeConfig"),
145
+ ("ernie4_5_vl_moe", "Ernie4_5_VL_MoeConfig"),
145
146
  ("esm", "EsmConfig"),
146
147
  ("evolla", "EvollaConfig"),
147
148
  ("exaone4", "Exaone4Config"),
@@ -179,6 +180,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
179
180
  ("glm4v_moe_vision", "Glm4vMoeVisionConfig"),
180
181
  ("glm4v_text", "Glm4vTextConfig"),
181
182
  ("glm4v_vision", "Glm4vVisionConfig"),
183
+ ("glmasr", "GlmAsrConfig"),
184
+ ("glmasr_encoder", "GlmAsrEncoderConfig"),
182
185
  ("glpn", "GLPNConfig"),
183
186
  ("got_ocr2", "GotOcr2Config"),
184
187
  ("gpt-sw3", "GPT2Config"),
@@ -215,6 +218,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
215
218
  ("instructblipvideo", "InstructBlipVideoConfig"),
216
219
  ("internvl", "InternVLConfig"),
217
220
  ("internvl_vision", "InternVLVisionConfig"),
221
+ ("jais2", "Jais2Config"),
218
222
  ("jamba", "JambaConfig"),
219
223
  ("janus", "JanusConfig"),
220
224
  ("jetmoe", "JetMoeConfig"),
@@ -306,6 +310,12 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
306
310
  ("parakeet_encoder", "ParakeetEncoderConfig"),
307
311
  ("patchtsmixer", "PatchTSMixerConfig"),
308
312
  ("patchtst", "PatchTSTConfig"),
313
+ ("pe_audio", "PeAudioConfig"),
314
+ ("pe_audio_encoder", "PeAudioEncoderConfig"),
315
+ ("pe_audio_video", "PeAudioVideoConfig"),
316
+ ("pe_audio_video_encoder", "PeAudioVideoEncoderConfig"),
317
+ ("pe_video", "PeVideoConfig"),
318
+ ("pe_video_encoder", "PeVideoEncoderConfig"),
309
319
  ("pegasus", "PegasusConfig"),
310
320
  ("pegasus_x", "PegasusXConfig"),
311
321
  ("perceiver", "PerceiverConfig"),
@@ -316,6 +326,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
316
326
  ("phi4_multimodal", "Phi4MultimodalConfig"),
317
327
  ("phimoe", "PhimoeConfig"),
318
328
  ("pix2struct", "Pix2StructConfig"),
329
+ ("pixio", "PixioConfig"),
319
330
  ("pixtral", "PixtralVisionConfig"),
320
331
  ("plbart", "PLBartConfig"),
321
332
  ("poolformer", "PoolFormerConfig"),
@@ -582,6 +593,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
582
593
  ("ernie", "ERNIE"),
583
594
  ("ernie4_5", "Ernie4_5"),
584
595
  ("ernie4_5_moe", "Ernie4_5_MoE"),
596
+ ("ernie4_5_vl_moe", "Ernie4_5_VL_MoE"),
585
597
  ("esm", "ESM"),
586
598
  ("evolla", "Evolla"),
587
599
  ("exaone4", "EXAONE-4.0"),
@@ -622,6 +634,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
622
634
  ("glm4v_moe_vision", "Glm4vMoeVisionModel"),
623
635
  ("glm4v_text", "GLM4V"),
624
636
  ("glm4v_vision", "Glm4vVisionModel"),
637
+ ("glmasr", "GLM-ASR"),
638
+ ("glmasr_encoder", "GLM-ASR Encoder"),
625
639
  ("glpn", "GLPN"),
626
640
  ("got_ocr2", "GOT-OCR2"),
627
641
  ("gpt-sw3", "GPT-Sw3"),
@@ -659,6 +673,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
659
673
  ("instructblipvideo", "InstructBlipVideo"),
660
674
  ("internvl", "InternVL"),
661
675
  ("internvl_vision", "InternVLVision"),
676
+ ("jais2", "Jais2"),
662
677
  ("jamba", "Jamba"),
663
678
  ("janus", "Janus"),
664
679
  ("jetmoe", "JetMoe"),
@@ -762,6 +777,12 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
762
777
  ("parakeet_encoder", "ParakeetEncoder"),
763
778
  ("patchtsmixer", "PatchTSMixer"),
764
779
  ("patchtst", "PatchTST"),
780
+ ("pe_audio", "PeAudio"),
781
+ ("pe_audio_encoder", "PeAudioEncoder"),
782
+ ("pe_audio_video", "PeAudioVideo"),
783
+ ("pe_audio_video_encoder", "PeAudioVideoEncoder"),
784
+ ("pe_video", "PeVideo"),
785
+ ("pe_video_encoder", "PeVideoEncoder"),
765
786
  ("pegasus", "Pegasus"),
766
787
  ("pegasus_x", "PEGASUS-X"),
767
788
  ("perceiver", "Perceiver"),
@@ -773,6 +794,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
773
794
  ("phimoe", "Phimoe"),
774
795
  ("phobert", "PhoBERT"),
775
796
  ("pix2struct", "Pix2Struct"),
797
+ ("pixio", "Pixio"),
776
798
  ("pixtral", "Pixtral"),
777
799
  ("plbart", "PLBart"),
778
800
  ("poolformer", "PoolFormer"),
@@ -955,6 +977,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
955
977
  ("glm4v_moe_vision", "glm4v_moe"),
956
978
  ("glm4v_text", "glm4v"),
957
979
  ("glm4v_moe_text", "glm4v_moe"),
980
+ ("glmasr_encoder", "glmasr"),
958
981
  ("grounding-dino", "grounding_dino"),
959
982
  ("mm-grounding-dino", "mm_grounding_dino"),
960
983
  ("idefics3_vision", "idefics3"),
@@ -981,6 +1004,10 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
981
1004
  ("llama4_text", "llama4"),
982
1005
  ("blip_2_qformer", "blip_2"),
983
1006
  ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
1007
+ ("perception_encoder", "perception_lm"),
1008
+ ("pe_audio_encoder", "pe_audio"),
1009
+ ("pe_video_encoder", "pe_video"),
1010
+ ("pe_audio_video_encoder", "pe_audio_video"),
984
1011
  ("video_llama_3_vision", "video_llama_3"),
985
1012
  ("parakeet_encoder", "parakeet"),
986
1013
  ("parakeet_ctc", "parakeet"),