transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/paddleocr_vl/modular_paddleocr_vl.py

@@ -25,6 +25,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
+from ... import initialization as init
 from ...activations import GELUActivation
 from ...cache_utils import Cache, DynamicCache
 from ...image_processing_utils import BatchFeature
@@ -776,6 +777,14 @@ class PaddleOCRVLPreTrainedModel(PreTrainedModel):
         "attentions": PaddleOCRAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PaddleOCRVisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, PaddleOCRVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel, Ernie4_5Model):
     def __init__(self, config: PaddleOCRTextConfig):
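The new `_init_weights` hook moves buffer setup onto the standard initialization path: `position_ids` is refilled with a fresh `arange`, and the vision rotary embedding gets the usual RoPE inverse-frequency table. A minimal sketch of that table, with illustrative `dim`/`theta` values rather than the real PaddleOCR-VL config:

    import torch

    # Illustrative values; the real ones live on PaddleOCRVisionRotaryEmbedding.
    dim, theta = 64, 10000.0
    # One frequency per rotated channel pair: theta ** (-2i / dim), i = 0..dim/2-1
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    print(inv_freq.shape)  # torch.Size([32])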
@@ -977,18 +986,17 @@ class PaddleOCRVisionEncoder(VideoLlama3VisionEncoder):
         attention_mask: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
     ) -> BaseModelOutput:
-        """
-        Args:
-            inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
-                The cumulative sequence lengths of each image or video feature.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                The attention_mask used in forward function shape [batch_size X sequence_length] if not None.
-            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
-                The temporal, height and width of feature shape of each image in LLM.
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+            The cumulative sequence lengths of each image or video feature.
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The attention_mask used in forward function shape [batch_size X sequence_length] if not None.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         device = inputs_embeds.device
         hidden_states = inputs_embeds
@@ -1037,6 +1045,8 @@ class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
         self.encoder = PaddleOCRVisionEncoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     def forward(
         self,
         pixel_values: torch.FloatTensor,
transformers/models/paligemma/modeling_paligemma.py

@@ -149,7 +149,8 @@ def create_causal_mask_mapping(
     position_ids: Optional[torch.Tensor],
     token_type_ids: Optional[torch.Tensor] = None,
     pixel_values: Optional[torch.FloatTensor] = None,
-    is_training: bool = False,
+    is_training: Optional[bool] = False,
+    is_first_iteration: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     """
@@ -169,31 +170,33 @@ def create_causal_mask_mapping(
         "past_key_values": past_key_values,
         "position_ids": position_ids,
     }
-    # NOTE: this `is_prompt` logic is not flawless, it fails when we're using a cache eagerly initialized
-    # (e.g. compiled prefill) AND `pixel_values` are not provided (i.e. the image data is provided through other
-    # means). Determining prefill in that case requires checking data values, which is not compile-compatible.
-    maybe_is_prompt = past_key_values is None or not past_key_values.is_initialized or pixel_values is not None
-
-    if maybe_is_prompt:
+    # Infer if prefill or decoding stage, if the flag isn't passed. This happens only when the mask is constructed
+    # from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be
+    # running generation with custom loop. Thus we need to infer it in a `non-perfect` way
+    # NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible.
+    is_first_iteration = (
+        is_first_iteration
+        if is_first_iteration
+        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
+    )
+
+    if is_first_iteration or not kwargs.get("use_cache", True):
         if token_type_ids is not None:
             # The logic bellow was originally written for Gemma3, where `token_type_ids` is reversed. Let's reverse
             # it to then use exactly the same logic.
             token_type_ids = 1 - token_type_ids
         else:
             logger.warning_once(
-                "The input may be the prompt, but `token_type_ids` is not provided. We recommend "
+                "It is a prefill stage but The `token_type_ids` is not provided. We recommend "
                 "passing `token_type_ids` to the model to prevent bad attention masking."
             )
-            # BC: when NOT training, use bidirectional mask if sequence length > 1. Otherwise, use the default causal
-            # mask. This is incorrect in some advanced use cases, hence the warning above.
             # NOTE: this branch can't be reached when training because `token_type_ids` is required as a model input.
-            if input_embeds.shape[1] > 1:
-                token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
+            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
 
     # Logic originally copied from Gemma3. It holds up for Paligemma as well because Paligemma assumes up to one image
     # per prompt AND we reverse `token_type_ids` above. Gemma3 uses a bidirectional mask for images, tagged through
     # `token_type_ids` 1s.
-    if token_type_ids is not None and maybe_is_prompt:
+    if token_type_ids is not None and is_first_iteration:
         # We need to pass an additional mask function to account for token type ids, and it needs to be an `or` (to
         # undo the causal masking)
 
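The core of the change: the old `maybe_is_prompt` heuristic becomes an explicit `is_first_iteration` flag that generation code can pass down, with the heuristic kept only as a fallback for direct `forward` calls. A self-contained sketch of how the flag resolves (`FakeCache` and `resolve_first_iteration` are stand-ins for illustration, not transformers APIs):

    from typing import Optional

    class FakeCache:
        # Stand-in exposing only the attribute the heuristic reads.
        is_initialized = True

    def resolve_first_iteration(flag: Optional[bool], past_key_values, pixel_values) -> bool:
        # Mirrors the ternary in the hunk above: an explicit True wins; None (or False)
        # falls back to the heuristic, which cannot inspect tensor values under compile.
        return flag if flag else (
            past_key_values is None
            or not past_key_values.is_initialized
            or pixel_values is not None
        )

    print(resolve_first_iteration(None, None, None))             # True: no cache yet -> prefill
    print(resolve_first_iteration(None, FakeCache(), None))      # False: warm cache, text-only decode step
    print(resolve_first_iteration(None, FakeCache(), object()))  # True: pixels present -> treat as first pass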
@@ -550,6 +553,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         use_cache=True,
         logits_to_keep=None,
         labels=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- custom `position_ids` and `pixel_values` handling
@@ -563,6 +567,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
             use_cache=use_cache,
             logits_to_keep=logits_to_keep,
             token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -570,9 +575,11 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         if model_inputs.get("position_ids") is not None:
             model_inputs["position_ids"] += 1
 
-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
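Downstream, `prepare_inputs_for_generation` now gates `pixel_values` on the same flag instead of on `cache_position[0] == 0`, which mislabeled a first step that continues from a cached prefix (e.g. a cached system prompt). A tiny truth table of the new gate, mirroring the hunk above:

    def passes_pixel_values(is_first_iteration: bool, use_cache: bool) -> bool:
        return is_first_iteration or not use_cache

    print(passes_pixel_values(True, True))    # True: first step, even when continuing from a cache
    print(passes_pixel_values(False, True))   # False: decode steps reuse the cached image features
    print(passes_pixel_values(False, False))  # True: without a cache, pixels must be re-sent each step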
@@ -586,6 +593,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         past_key_values: Optional[Cache],
         position_ids: Optional[torch.Tensor],
         token_type_ids: Optional[torch.Tensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ) -> dict:
         # Uses the overwritten `create_masks_for_generate` with `token_type_ids` masking
@@ -597,7 +605,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
             past_key_values,
             position_ids,
             token_type_ids,
-            pixel_values=kwargs.get("pixel_values"),
+            is_first_iteration=is_first_iteration,
             **{k: v for k, v in kwargs.items() if k != "pixel_values"},
         )
 
transformers/models/parakeet/modeling_parakeet.py

@@ -510,6 +510,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config

transformers/models/parakeet/modular_parakeet.py

@@ -346,6 +346,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
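Both files gain the same branch (the modeling file is generated from the modular one), filling the relative positional encoding's `inv_freq` buffer in `_init_weights`. A standalone sketch of the sinusoidal formula, with an illustrative `hidden_size`:

    import torch

    # Illustrative; the real value is self.config.hidden_size in the hunks above.
    hidden_size = 512
    # Classic sinusoidal frequencies: 10000 ** (-2i / d) for i = 0..d/2-1
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, hidden_size, 2, dtype=torch.int64) / hidden_size)
    )
    print(inv_freq.shape)  # torch.Size([256])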
transformers/models/parakeet/tokenization_parakeet.py (renamed from tokenization_parakeet_fast.py)

@@ -16,10 +16,10 @@
 import itertools
 from typing import Optional, Union
 
-from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import TokenizersBackend
 
 
-class ParakeetTokenizerFast(PreTrainedTokenizerFast):
+class ParakeetTokenizer(TokenizersBackend):
     """
     Inherits all methods from [`PreTrainedTokenizerFast`]. Users should refer to this superclass for more information regarding those methods,
     except for `_decode` which is overridden to adapt it to CTC decoding:
@@ -51,4 +51,4 @@ class ParakeetTokenizerFast(PreTrainedTokenizerFast):
     )
 
 
-__all__ = ["ParakeetTokenizerFast"]
+__all__ = ["ParakeetTokenizer"]
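For downstream imports this is a straight rename; a hedged migration sketch, assuming the class stays re-exported at the top level as the new `__all__` suggests:

    # rc1 (old):
    # from transformers import ParakeetTokenizerFast
    # rc2 (new) -- assumed top-level re-export:
    from transformers import ParakeetTokenizer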
transformers/models/patchtsmixer/modeling_patchtsmixer.py

@@ -696,6 +696,10 @@ class PatchTSMixerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, PatchTSMixerBatchNorm):
             init.zeros_(module.batchnorm.bias)
             init.ones_(module.batchnorm.weight)

transformers/models/patchtst/modeling_patchtst.py

@@ -584,12 +584,13 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
                 init.copy_(module.position_enc, position_enc)
             else:
                 init.copy_(module.position_enc, position_enc)
-        elif isinstance(module, nn.LayerNorm):
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
-        elif isinstance(module, PatchTSTBatchNorm):
-            init.zeros_(module.batchnorm.bias)
-            init.ones_(module.batchnorm.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=self.config.init_std)
             if module.bias is not None:
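Both time-series models get the same fix: `nn.BatchNorm1d` carries buffers (`running_mean`, `running_var`, `num_batches_tracked`) that are not parameters, so resetting `weight`/`bias` alone leaves stale statistics behind; the `getattr` guard lets the widened branch also serve `nn.LayerNorm`, which has no such buffers. A quick check of that distinction:

    from torch import nn

    bn, ln = nn.BatchNorm1d(8), nn.LayerNorm(8)
    print(getattr(bn, "running_mean", None) is not None)  # True: BatchNorm tracks running stats
    print(getattr(ln, "running_mean", None) is not None)  # False: LayerNorm has none, so the guard skips it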
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pe_audio import *
+    from .feature_extraction_pe_audio import *
+    from .modeling_pe_audio import *
+    from .processing_pe_audio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
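The package init follows the lazy-import convention used across `transformers`: the module in `sys.modules` is replaced by a `_LazyModule`, so `modeling_pe_audio` and friends are imported only when one of their attributes is first accessed, keeping `import transformers` cheap. A toy stand-in (not the real `_LazyModule` implementation) illustrating the mechanism:

```python
import importlib
import types


class ToyLazyModule(types.ModuleType):
    """Simplified stand-in for transformers.utils._LazyModule."""

    def __init__(self, name: str, attr_to_module: dict[str, str]):
        super().__init__(name)
        self._attr_to_module = attr_to_module

    def __getattr__(self, attr: str):
        if attr not in self._attr_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        # Import the backing submodule only now, on first attribute access.
        submodule = importlib.import_module(self._attr_to_module[attr])
        return getattr(submodule, attr)


# Example: "sqrt" is resolved lazily from the math module.
lazy = ToyLazyModule("toy", {"sqrt": "math"})
print(lazy.sqrt(2.0))
```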
@@ -0,0 +1,206 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioEncoder`]. It is used to instantiate a
+    PeAudioEncoder model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        dac_config (`Union[PreTrainedConfig, dict]`, *optional*):
+            Configuration for the DAC audio encoder used to tokenize the raw audio inputs. If a dictionary is passed, it
+            will be used to instantiate a [`~transformers.DacConfig`] with default DAC hyperparameters.
+        hidden_size (`int`, *optional*, defaults to 1792):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4800):
+            Dimension of the feedforward layers in the Transformer blocks.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of Transformer encoder blocks.
+        num_attention_heads (`int`, *optional*, defaults to 14):
+            Number of attention heads used in each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key and value heads for grouped-query attention. If unset, this defaults to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head for query, key, and value projections.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the Transformer blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 10000):
+            Maximum sequence length supported by the rotary position embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated normal initializer for weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon used by the RMS normalization layers.
+        rope_parameters (`Union[RopeParameters, dict]`, *optional*, defaults to `{'rope_theta': 20000}`):
+            Parameters for the rotary position embeddings, such as the base `rope_theta`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias terms in the query, key, value, and output projections.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to attention probabilities.
+
+    ```python
+    >>> from transformers import PeAudioEncoder, PeAudioEncoderConfig
+
+    >>> # Initializing a PeAudioEncoder style configuration
+    >>> configuration = PeAudioEncoderConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio_encoder"
+    sub_configs = {"dac_config": AutoConfig}
+    base_config_key = "audio_video_config"
+
+    _default_dac_config_kwargs = {
+        "downsampling_ratios": [2, 8, 10, 12],
+        "encoder_hidden_size": 64,
+        "codebook_dim": 128,
+    }
+
+    def __init__(
+        self,
+        dac_config: Optional[Union[dict, PreTrainedConfig]] = None,
+        hidden_size: Optional[int] = 1792,
+        intermediate_size: Optional[int] = 4800,
+        num_hidden_layers: Optional[int] = 6,
+        num_attention_heads: Optional[int] = 14,
+        num_key_value_heads: Optional[int] = None,
+        head_dim: Optional[int] = 128,
+        hidden_act: Optional[str] = "silu",
+        max_position_embeddings: Optional[int] = 10000,
+        initializer_range: Optional[float] = 0.02,
+        rms_norm_eps: Optional[float] = 1e-5,
+        rope_parameters: Optional[Union[RopeParameters, dict]] = {"rope_theta": 20000},
+        attention_bias: Optional[bool] = False,
+        attention_dropout: Optional[float] = 0.0,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        if isinstance(dac_config, dict):
+            dac_config["model_type"] = dac_config.get("model_type", "dac")
+            dac_config = CONFIG_MAPPING[dac_config["model_type"]](**{**self._default_dac_config_kwargs, **dac_config})
+        elif dac_config is None:
+            dac_config = CONFIG_MAPPING["dac"](**self._default_dac_config_kwargs)
+
+        self.dac_config = dac_config
+
+        super().__init__(**kwargs)
+
+
+class PeAudioConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioModel`]. It is used to instantiate a
+    PeAudioModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the text model component.
+        audio_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the audio encoder component.
+
+    ```python
+    >>> from transformers import PeAudioModel, PeAudioConfig
+
+    >>> # Initializing a PeAudioModel style configuration
+    >>> configuration = PeAudioConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio"
+    sub_configs = {"text_config": AutoConfig, "audio_config": PeAudioEncoderConfig}
+    base_config_key = "audio_video_config"
+
+    _default_text_config_kwargs = {
+        "model_type": "modernbert",
+        "hidden_size": 1024,
+        "intermediate_size": 2624,
+        "num_hidden_layers": 22,
+        "num_attention_heads": 16,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        audio_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "modernbert")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](
+                **{**self._default_text_config_kwargs, **text_config}
+            )
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs)
+
+        if isinstance(audio_config, dict):
+            audio_config = PeAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = PeAudioEncoderConfig()
+
+        self.text_config = text_config
+        self.audio_config = audio_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["PeAudioEncoderConfig", "PeAudioConfig"]
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import load_audio
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a PeAudioFeatureExtractor feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 48000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 1920):
+            Hop length, in samples, between successive analysis windows (the stride, not the overlap).
+    """
+
+    model_input_names = ["input_values"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 48_000,
+        padding_value: float = 0.0,
+        hop_length: int = 1920,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def _reflect_pad(self, wav):
+        if len(wav) % self.hop_length == 0:
+            return wav
+        p1d = (0, self.hop_length - (len(wav) % self.hop_length))
+        return np.pad(wav, p1d, "reflect")
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]], str, list[str]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        from_file = False
+        if isinstance(raw_audio, str):
+            raw_audio = [raw_audio]
+
+        if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
+            loaded = []
+            for audio_file in raw_audio:
+                loaded.append(load_audio(audio_file, self.sampling_rate))
+            raw_audio = loaded
+            from_file = True
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        elif not from_file:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        if isinstance(raw_audio, list):
+            raw_audio = [self._reflect_pad(x) for x in raw_audio]
+        else:
+            raw_audio = self._reflect_pad(raw_audio)
+
+        # verify inputs are valid
+        for example in raw_audio:
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2:
+                raise ValueError("Stereo audio isn't supported for now")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=padding,
+            pad_to_multiple_of=self.hop_length,
+        )
+        if padding:
+            padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+        if padding:
+            padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["PeAudioFeatureExtractor"]