transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/__init__.py CHANGED
@@ -18,7 +18,7 @@
18
18
  # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
19
19
  # in the namespace without actually importing anything (and especially none of the backends).
20
20
 
21
- __version__ = "5.0.0rc1"
21
+ __version__ = "5.0.0rc2"
22
22
 
23
23
  import importlib
24
24
  import sys
@@ -36,6 +36,7 @@ from .utils import (
36
36
  is_librosa_available,
37
37
  is_mistral_common_available,
38
38
  is_mlx_available,
39
+ is_numba_available,
39
40
  is_pretty_midi_available,
40
41
  )
41
42
 
@@ -266,6 +267,7 @@ _import_structure = {
266
267
  ],
267
268
  "video_utils": [],
268
269
  "utils.kernel_config": ["KernelConfig"],
270
+ "utils.import_utils": ["requires_backends"],
269
271
  }
270
272
 
271
273
  # tokenizers-backed objects
@@ -439,6 +441,15 @@ else:
439
441
  "convert_and_export_with_cache",
440
442
  ]
441
443
 
444
+ _import_structure["core_model_loading"] = [
445
+ "Chunk",
446
+ "Concatenate",
447
+ "ConversionOps",
448
+ "MergeModulelist",
449
+ "PermuteForRope",
450
+ "SplitModulelist",
451
+ "WeightConverter",
452
+ ]
442
453
  _import_structure["modeling_flash_attention_utils"] = []
443
454
  _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
444
455
  _import_structure["modeling_outputs"] = []
@@ -492,6 +503,13 @@ if TYPE_CHECKING:
492
503
  from .configuration_utils import PretrainedConfig as PretrainedConfig
493
504
  from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS as SLOW_TO_FAST_CONVERTERS
494
505
  from .convert_slow_tokenizer import convert_slow_tokenizer as convert_slow_tokenizer
506
+ from .core_model_loading import Chunk as Chunk
507
+ from .core_model_loading import Concatenate as Concatenate
508
+ from .core_model_loading import ConversionOps as ConversionOps
509
+ from .core_model_loading import MergeModulelist as MergeModulelist
510
+ from .core_model_loading import PermuteForRope as PermuteForRope
511
+ from .core_model_loading import SplitModulelist as SplitModulelist
512
+ from .core_model_loading import WeightConverter as WeightConverter
495
513
 
496
514
  # Data
497
515
  from .data import DataProcessor as DataProcessor
@@ -750,6 +768,7 @@ if TYPE_CHECKING:
750
768
  from .utils import is_torch_npu_available as is_torch_npu_available
751
769
  from .utils import is_torch_xla_available as is_torch_xla_available
752
770
  from .utils import is_torch_xpu_available as is_torch_xpu_available
771
+ from .utils.import_utils import requires_backends
753
772
  from .utils.kernel_config import KernelConfig as KernelConfig
754
773
 
755
774
  # Quantization config
@@ -205,7 +205,7 @@ class LaplaceActivation(nn.Module):
205
205
 
206
206
  class ReLUSquaredActivation(nn.Module):
207
207
  """
208
- Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
208
+ Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
209
209
  """
210
210
 
211
211
  def forward(self, input):
@@ -166,7 +166,6 @@ def load_audio_as(
166
166
  - `dict`: Dictionary with 'data' (base64 encoded audio data) and 'format' keys (if return_format="dict")
167
167
  - `io.BytesIO`: BytesIO object containing audio data (if return_format="buffer")
168
168
  """
169
- # TODO: @eustlb, we actually don't need librosa but soxr is installed with librosa
170
169
  requires_backends(load_audio_as, ["librosa"])
171
170
 
172
171
  if return_format not in ["base64", "dict", "buffer"]:
@@ -37,7 +37,7 @@ class CacheLayerMixin(ABC):
37
37
  return f"{self.__class__.__name__}"
38
38
 
39
39
  @abstractmethod
40
- def lazy_initialization(self, key_states: torch.Tensor): ...
40
+ def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None: ...
41
41
 
42
42
  @abstractmethod
43
43
  def update(
@@ -89,7 +89,7 @@ class DynamicLayer(CacheLayerMixin):
89
89
 
90
90
  is_sliding = False
91
91
 
92
- def lazy_initialization(self, key_states: torch.Tensor):
92
+ def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
93
93
  self.dtype, self.device = key_states.dtype, key_states.device
94
94
  self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
95
95
  self.values = torch.tensor([], dtype=self.dtype, device=self.device)
@@ -114,7 +114,7 @@ class DynamicLayer(CacheLayerMixin):
114
114
  """
115
115
  # Lazy initialization
116
116
  if not self.is_initialized:
117
- self.lazy_initialization(key_states)
117
+ self.lazy_initialization(key_states, value_states)
118
118
 
119
119
  self.keys = torch.cat([self.keys, key_states], dim=-2)
120
120
  self.values = torch.cat([self.values, value_states], dim=-2)
@@ -178,8 +178,8 @@ class DynamicSlidingWindowLayer(DynamicLayer):
178
178
  self.cumulative_length = 0
179
179
  self._sliding_window_tensor = torch.tensor(self.sliding_window, dtype=torch.long)
180
180
 
181
- def lazy_initialization(self, key_states: torch.Tensor) -> None:
182
- super().lazy_initialization(key_states)
181
+ def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
182
+ super().lazy_initialization(key_states, value_states)
183
183
  self._sliding_window_tensor = self._sliding_window_tensor.to(self.device)
184
184
 
185
185
  def update(
@@ -201,7 +201,7 @@ class DynamicSlidingWindowLayer(DynamicLayer):
201
201
  """
202
202
  # Lazy initialization
203
203
  if not self.is_initialized:
204
- self.lazy_initialization(key_states)
204
+ self.lazy_initialization(key_states, value_states)
205
205
 
206
206
  self.cumulative_length += key_states.shape[-2]
207
207
 
@@ -267,7 +267,7 @@ class StaticLayer(CacheLayerMixin):
267
267
  super().__init__()
268
268
  self.max_cache_len = max_cache_len
269
269
 
270
- def lazy_initialization(self, key_states: torch.Tensor):
270
+ def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
271
271
  """
272
272
  Lazy initialization of the keys and values tensors. This allows to get all properties (dtype, device,
273
273
  num_heads in case of TP etc...) at runtime directly, which is extremely practical as it avoids moving
@@ -281,16 +281,18 @@ class StaticLayer(CacheLayerMixin):
281
281
  i.e. `mode="reduce-overhead"` is known to fail). But it will in general work correctly, and prefill should
282
282
  not be compiled anyway for performances!
283
283
  """
284
- self.max_batch_size, self.num_heads, _, self.head_dim = key_states.shape
285
284
  self.dtype, self.device = key_states.dtype, key_states.device
285
+ self.max_batch_size, self.num_heads = key_states.shape[:2]
286
+ self.v_head_dim = value_states.shape[-1]
287
+ self.k_head_dim = key_states.shape[-1]
286
288
 
287
289
  self.keys = torch.zeros(
288
- (self.max_batch_size, self.num_heads, self.max_cache_len, self.head_dim),
290
+ (self.max_batch_size, self.num_heads, self.max_cache_len, self.k_head_dim),
289
291
  dtype=self.dtype,
290
292
  device=self.device,
291
293
  )
292
294
  self.values = torch.zeros(
293
- (self.max_batch_size, self.num_heads, self.max_cache_len, self.head_dim),
295
+ (self.max_batch_size, self.num_heads, self.max_cache_len, self.v_head_dim),
294
296
  dtype=self.dtype,
295
297
  device=self.device,
296
298
  )
@@ -323,7 +325,7 @@ class StaticLayer(CacheLayerMixin):
323
325
  """
324
326
  # Lazy initialization
325
327
  if not self.is_initialized:
326
- self.lazy_initialization(key_states)
328
+ self.lazy_initialization(key_states, value_states)
327
329
 
328
330
  # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention,
329
331
  # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len)
@@ -398,7 +400,7 @@ class StaticSlidingWindowLayer(StaticLayer):
398
400
  """
399
401
  # Lazy initialization
400
402
  if not self.is_initialized:
401
- self.lazy_initialization(key_states)
403
+ self.lazy_initialization(key_states, value_states)
402
404
 
403
405
  # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention,
404
406
  # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len)
@@ -533,7 +535,7 @@ class QuantizedLayer(DynamicLayer):
533
535
 
534
536
  # Lazy initialization
535
537
  if not self.is_initialized:
536
- self.lazy_initialization(key_states)
538
+ self.lazy_initialization(key_states, value_states)
537
539
  self._quantized_keys = self._quantize(key_states.contiguous(), axis=self.axis_key)
538
540
  self._quantized_values = self._quantize(value_states.contiguous(), axis=self.axis_value)
539
541
  return key_states, value_states
@@ -795,10 +797,10 @@ class Cache:
795
797
  # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use
796
798
  # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only
797
799
  # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical
798
- fake_keys_tensor = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
800
+ fake_kv_tensor = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
799
801
  # Init all layers
800
802
  for layer in self.layers:
801
- layer.lazy_initialization(fake_keys_tensor)
803
+ layer.lazy_initialization(fake_kv_tensor, fake_kv_tensor)
802
804
 
803
805
  def get_seq_length(self, layer_idx: int = 0) -> int:
804
806
  """Returns the sequence length of the cache for the given layer."""
@@ -16,6 +16,7 @@
16
16
 
17
17
  import copy
18
18
  import json
19
+ import math
19
20
  import os
20
21
  import warnings
21
22
  from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
@@ -25,6 +26,7 @@ from packaging import version
25
26
 
26
27
  from . import __version__
27
28
  from .dynamic_module_utils import custom_object_save
29
+ from .generation.configuration_utils import GenerationConfig
28
30
  from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
29
31
  from .modeling_rope_utils import RotaryEmbeddingConfigMixin
30
32
  from .utils import (
@@ -49,6 +51,9 @@ logger = logging.get_logger(__name__)
49
51
  # type hinting: specifying the type of config class that inherits from PreTrainedConfig
50
52
  SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig")
51
53
 
54
+ _FLOAT_TAG_KEY = "__float__"
55
+ _FLOAT_TAG_VALUES = {"Infinity": float("inf"), "-Infinity": float("-inf"), "NaN": float("nan")}
56
+
52
57
 
53
58
  class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
54
59
  # no-format
@@ -120,9 +125,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
120
125
  Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
121
126
  that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
122
127
  in `AUTO_MODELS_FOR_CAUSAL_LM`.
123
- tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
124
- Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
125
- and decoder model to have the exact same parameter names.
126
128
  chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
127
129
  The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
128
130
  the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
@@ -212,7 +214,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
212
214
  is_decoder: bool = False,
213
215
  cross_attention_hidden_size: Optional[int] = None,
214
216
  add_cross_attention: bool = False,
215
- tie_encoder_decoder: bool = False,
216
217
  # Fine-tuning task arguments
217
218
  architectures: Optional[list[str]] = None,
218
219
  finetuning_task: Optional[str] = None,
@@ -276,6 +277,10 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
276
277
  self._output_attentions = output_attentions # has public property
277
278
 
278
279
  # Less common kwargs, only used by some models
280
+ if "tie_encoder_decoder" in kwargs:
281
+ tie_encoder_decoder = kwargs.pop("tie_encoder_decoder")
282
+ tie_word_embeddings = tie_encoder_decoder or tie_word_embeddings
283
+
279
284
  self.tie_word_embeddings = tie_word_embeddings
280
285
  self.chunk_size_feed_forward = chunk_size_feed_forward
281
286
 
@@ -284,7 +289,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
284
289
  self.is_decoder = is_decoder # used in encoder-decoder models to differentiate encoder from decoder
285
290
  self.cross_attention_hidden_size = cross_attention_hidden_size
286
291
  self.add_cross_attention = add_cross_attention
287
- self.tie_encoder_decoder = tie_encoder_decoder
288
292
 
289
293
  # Fine-tuning task attributes
290
294
  self.architectures = architectures
@@ -310,7 +314,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
310
314
  self.decoder_start_token_id = decoder_start_token_id
311
315
 
312
316
  # Parameters for sequence generation saved in the config are popped instead of loading them.
313
- for parameter_name in self._get_global_generation_defaults().keys():
317
+ for parameter_name in GenerationConfig._get_default_generation_params().keys():
314
318
  kwargs.pop(parameter_name, None)
315
319
 
316
320
  # Name or path to the pretrained checkpoint
@@ -320,6 +324,9 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
320
324
  # Attention implementation to use, if relevant (it sets it recursively on sub-configs)
321
325
  self._attn_implementation = kwargs.pop("attn_implementation", None)
322
326
 
327
+ # Experts implementation to use, if relevant (it sets it recursively on sub-configs)
328
+ self._experts_implementation = kwargs.pop("experts_implementation", None)
329
+
323
330
  # Drop the transformers version info
324
331
  self.transformers_version = kwargs.pop("transformers_version", None)
325
332
 
@@ -413,6 +420,28 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
413
420
  )
414
421
  subconfig._attn_implementation = sub_implementation
415
422
 
423
+ @property
424
+ def _experts_implementation(self):
425
+ return self._experts_implementation_internal
426
+
427
+ @_experts_implementation.setter
428
+ def _experts_implementation(self, value: str | dict | None):
429
+ """We set it recursively on the sub-configs as well"""
430
+ # Set if for current config
431
+ current_moe = getattr(self, "_experts_implementation", None)
432
+ experts_implementation = value if not isinstance(value, dict) else value.get("", current_moe)
433
+ self._experts_implementation_internal = experts_implementation
434
+
435
+ # Set it recursively on the subconfigs
436
+ for subconfig_key in self.sub_configs:
437
+ subconfig = getattr(self, subconfig_key, None)
438
+ if subconfig is not None:
439
+ current_subconfig_moe = getattr(subconfig, "_experts_implementation", None)
440
+ sub_implementation = (
441
+ value if not isinstance(value, dict) else value.get(subconfig_key, current_subconfig_moe)
442
+ )
443
+ subconfig._experts_implementation = sub_implementation
444
+
416
445
  @property
417
446
  def torch_dtype(self):
418
447
  logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
@@ -449,13 +478,11 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
449
478
  if os.path.isfile(save_directory):
450
479
  raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
451
480
 
452
- non_default_generation_parameters = self._get_non_default_generation_parameters()
453
- if len(non_default_generation_parameters) > 0:
481
+ generation_parameters = self._get_generation_parameters()
482
+ if len(generation_parameters) > 0:
454
483
  raise ValueError(
455
- "Some non-default generation parameters are set in the model config. These should go into either a) "
456
- "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
457
- "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)."
458
- f"\nNon-default generation parameters: {str(non_default_generation_parameters)}",
484
+ "Some generation parameters are set in the model config. These should go into `model.generation_config`"
485
+ f"as opposed to `model.config`. \nGeneration parameters found: {str(generation_parameters)}",
459
486
  )
460
487
 
461
488
  os.makedirs(save_directory, exist_ok=True)
@@ -754,8 +781,9 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
754
781
  # If both are present, use `dtype`
755
782
  kwargs["dtype"] = kwargs.get("dtype", torch_dtype)
756
783
 
757
- # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
784
+ # We remove them from kwargs so that they do not appear in `return_unused_kwargs`.
758
785
  config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
786
+ config_dict["experts_implementation"] = kwargs.pop("experts_implementation", None)
759
787
 
760
788
  config = cls(**config_dict)
761
789
 
@@ -813,7 +841,56 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
813
841
  def _dict_from_json_file(cls, json_file: str | os.PathLike):
814
842
  with open(json_file, encoding="utf-8") as reader:
815
843
  text = reader.read()
816
- return json.loads(text)
844
+ config_dict = json.loads(text)
845
+
846
+ return cls._decode_special_floats(config_dict)
847
+
848
+ @classmethod
849
+ def _encode_special_floats(cls, obj: Any) -> Any:
850
+ """
851
+ Iterates over the passed object and encode specific floats that cannot be JSON-serialized. Python's JSON
852
+ engine saves floats like `Infinity` (+/-) or `NaN` which are not compatible with other JSON engines.
853
+
854
+ It serializes floats like `Infinity` as an object: `{'__float__': Infinity}`.
855
+ """
856
+ if isinstance(obj, float):
857
+ if math.isnan(obj):
858
+ return {_FLOAT_TAG_KEY: "NaN"}
859
+ if obj == float("inf"):
860
+ return {_FLOAT_TAG_KEY: "Infinity"}
861
+ if obj == float("-inf"):
862
+ return {_FLOAT_TAG_KEY: "-Infinity"}
863
+ return obj
864
+
865
+ if isinstance(obj, dict):
866
+ return {k: cls._encode_special_floats(v) for k, v in obj.items()}
867
+
868
+ if isinstance(obj, (list, tuple)):
869
+ return [cls._encode_special_floats(v) for v in obj]
870
+
871
+ return obj
872
+
873
+ @classmethod
874
+ def _decode_special_floats(cls, obj: Any) -> Any:
875
+ """
876
+ Iterates over the passed object and decode specific floats that cannot be JSON-serialized. Python's JSON
877
+ engine saves floats like `Infinity` (+/-) or `NaN` which are not compatible with other JSON engines.
878
+
879
+ This method deserializes objects like `{'__float__': Infinity}` to their float values like `Infinity`.
880
+ """
881
+ if isinstance(obj, dict):
882
+ if set(obj.keys()) == {_FLOAT_TAG_KEY} and isinstance(obj[_FLOAT_TAG_KEY], str):
883
+ tag = obj[_FLOAT_TAG_KEY]
884
+ if tag in _FLOAT_TAG_VALUES:
885
+ return _FLOAT_TAG_VALUES[tag]
886
+ return obj
887
+
888
+ return {k: cls._decode_special_floats(v) for k, v in obj.items()}
889
+
890
+ if isinstance(obj, list):
891
+ return [cls._decode_special_floats(v) for v in obj]
892
+
893
+ return obj
817
894
 
818
895
  def __eq__(self, other):
819
896
  return isinstance(other, PreTrainedConfig) and (self.__dict__ == other.__dict__)
@@ -933,6 +1010,10 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
933
1010
  config_dict = self.to_diff_dict()
934
1011
  else:
935
1012
  config_dict = self.to_dict()
1013
+
1014
+ # Handle +/-Infinity and NaNs
1015
+ config_dict = self._encode_special_floats(config_dict)
1016
+
936
1017
  return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
937
1018
 
938
1019
  def to_json_file(self, json_file_path: str | os.PathLike, use_diff: bool = True):
@@ -1019,10 +1100,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
1019
1100
  Checks and removes if there are any keys in the dict that should not be serialized when saving the config.
1020
1101
  Runs recursive check on the dict, to remove from all sub configs.
1021
1102
  """
1022
- if hasattr(self, "quantization_config"):
1023
- # Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
1024
- _ = d.pop("_pre_quantization_dtype", None)
1025
-
1026
1103
  if "_auto_class" in d:
1027
1104
  del d["_auto_class"]
1028
1105
  if "_output_attentions" in d:
@@ -1031,6 +1108,8 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
1031
1108
  del d["_commit_hash"]
1032
1109
  if "_attn_implementation_internal" in d:
1033
1110
  del d["_attn_implementation_internal"]
1111
+ if "_experts_implementation_internal" in d:
1112
+ del d["_experts_implementation_internal"]
1034
1113
  # Do not serialize `base_model_tp_plan` for now
1035
1114
  if "base_model_tp_plan" in d:
1036
1115
  del d["base_model_tp_plan"]
@@ -1063,58 +1142,17 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
1063
1142
 
1064
1143
  cls._auto_class = auto_class
1065
1144
 
1066
- @staticmethod
1067
- def _get_global_generation_defaults() -> dict[str, Any]:
1068
- return {
1069
- "max_length": 20,
1070
- "min_length": 0,
1071
- "do_sample": False,
1072
- "early_stopping": False,
1073
- "num_beams": 1,
1074
- "temperature": 1.0,
1075
- "top_k": 50,
1076
- "top_p": 1.0,
1077
- "typical_p": 1.0,
1078
- "repetition_penalty": 1.0,
1079
- "length_penalty": 1.0,
1080
- "no_repeat_ngram_size": 0,
1081
- "encoder_no_repeat_ngram_size": 0,
1082
- "bad_words_ids": None,
1083
- "num_return_sequences": 1,
1084
- "output_scores": False,
1085
- "return_dict_in_generate": False,
1086
- "forced_bos_token_id": None,
1087
- "forced_eos_token_id": None,
1088
- "remove_invalid_values": False,
1089
- "exponential_decay_length_penalty": None,
1090
- "suppress_tokens": None,
1091
- "begin_suppress_tokens": None,
1092
- # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0
1093
- "num_beam_groups": 1,
1094
- "diversity_penalty": 0.0,
1095
- }
1096
-
1097
- def _get_non_default_generation_parameters(self) -> dict[str, Any]:
1145
+ def _get_generation_parameters(self) -> dict[str, Any]:
1098
1146
  """
1099
1147
  Gets the non-default generation parameters on the PreTrainedConfig instance
1100
1148
  """
1101
- non_default_generation_parameters = {}
1102
- decoder_attribute_name = None
1103
-
1104
- # If it is a composite model, we want to check the subconfig that will be used for generation
1105
- self_decoder_config = self if decoder_attribute_name is None else getattr(self, decoder_attribute_name)
1106
-
1107
- for parameter_name, default_global_value in self._get_global_generation_defaults().items():
1108
- if hasattr(self_decoder_config, parameter_name):
1109
- parameter_value = getattr(self_decoder_config, parameter_name, None)
1110
- # Two cases in which is okay for the model config to hold generation config parameters:
1111
- # 1. The parameter is set to `None`, effectively delegating its value to the generation config
1112
- # 2. The parameter is set the global generation defaults
1113
- if parameter_value is None or parameter_value == default_global_value:
1114
- continue
1115
- non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
1149
+ generation_params = {}
1150
+ default_config = self.__class__().to_dict() if not self.has_no_defaults_at_init else {}
1151
+ for key in GenerationConfig._get_default_generation_params().keys():
1152
+ if hasattr(self, key) and getattr(self, key) is not None and key not in default_config:
1153
+ generation_params[key] = getattr(self, key)
1116
1154
 
1117
- return non_default_generation_parameters
1155
+ return generation_params
1118
1156
 
1119
1157
  def get_text_config(self, decoder=None, encoder=None) -> "PreTrainedConfig":
1120
1158
  """
@@ -1255,18 +1293,24 @@ if PreTrainedConfig.push_to_hub.__doc__ is not None:
1255
1293
  PretrainedConfig = PreTrainedConfig
1256
1294
 
1257
1295
 
1258
- ALLOWED_LAYER_TYPES = (
1296
+ ALLOWED_ATTENTION_LAYER_TYPES = (
1259
1297
  "full_attention",
1260
1298
  "sliding_attention",
1261
1299
  "chunked_attention",
1262
1300
  "linear_attention", # used in minimax
1263
1301
  )
1264
1302
 
1303
+ ALLOWED_MLP_LAYER_TYPES = (
1304
+ "sparse",
1305
+ "dense",
1306
+ )
1307
+
1265
1308
 
1266
- def layer_type_validation(layer_types: list[str], num_hidden_layers: Optional[int] = None):
1309
+ def layer_type_validation(layer_types: list[str], num_hidden_layers: Optional[int] = None, attention: bool = True):
1267
1310
  """Check that `layer_types` is correctly defined."""
1268
- if not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in layer_types):
1269
- raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}")
1311
+ allowed_layer_types = ALLOWED_ATTENTION_LAYER_TYPES if attention else ALLOWED_MLP_LAYER_TYPES
1312
+ if not all(layer_type in allowed_layer_types for layer_type in layer_types):
1313
+ raise ValueError(f"The `layer_types` entries must be in {allowed_layer_types}")
1270
1314
  if num_hidden_layers is not None and num_hidden_layers != len(layer_types):
1271
1315
  raise ValueError(
1272
1316
  f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types "
@@ -18,7 +18,15 @@ from __future__ import annotations
18
18
  from copy import deepcopy
19
19
  from typing import TYPE_CHECKING
20
20
 
21
- from .core_model_loading import Concatenate, MergeModulelist, WeightConverter, WeightRenaming
21
+ from .core_model_loading import (
22
+ Chunk,
23
+ Concatenate,
24
+ ErnieFuseAndSplitTextVisionExperts,
25
+ MergeModulelist,
26
+ Transpose,
27
+ WeightConverter,
28
+ WeightRenaming,
29
+ )
22
30
  from .utils import is_torch_available
23
31
 
24
32
 
@@ -105,6 +113,57 @@ def _build_checkpoint_conversion_mapping():
105
113
  operations=[MergeModulelist(dim=0)],
106
114
  ),
107
115
  ],
116
+ "ernie4_5_vl_moe": [
117
+ # vision
118
+ WeightRenaming("vision_model", "vision_tower"),
119
+ # resampler
120
+ WeightRenaming("spatial_linear.0", "spatial_linear.fc1"),
121
+ WeightRenaming("spatial_linear.2", "spatial_linear.fc2"),
122
+ WeightRenaming("spatial_linear.3", "spatial_linear.ln"),
123
+ WeightRenaming("temporal_linear.0", "temporal_linear.fc1"),
124
+ WeightRenaming("temporal_linear.2", "temporal_linear.fc2"),
125
+ WeightRenaming("temporal_linear.3", "temporal_linear.ln"),
126
+ # language model
127
+ WeightRenaming(r"(?<!language_model\.)embed_tokens", "language_model.embed_tokens"),
128
+ WeightRenaming(r"(?<!language_model\.)layers", "language_model.layers"),
129
+ WeightConverter(
130
+ source_patterns="mlp.gate.weight_1",
131
+ target_patterns="mlp.vision_moe.gate.weight",
132
+ operations=[Transpose(dim0=0, dim1=1)],
133
+ ),
134
+ WeightConverter(
135
+ source_patterns="mlp.gate.weight",
136
+ target_patterns="mlp.text_moe.gate.weight",
137
+ operations=[Transpose(dim0=0, dim1=1)],
138
+ ),
139
+ WeightConverter(
140
+ source_patterns=["mlp.moe_statics.e_score_correction_bias"],
141
+ target_patterns=[
142
+ "mlp.text_moe.gate.moe_statics.e_score_correction_bias",
143
+ "mlp.vision_moe.gate.moe_statics.e_score_correction_bias",
144
+ ],
145
+ operations=[Chunk(dim=0)],
146
+ ),
147
+ WeightConverter(
148
+ source_patterns=["experts.*.down_proj.weight"],
149
+ target_patterns=[
150
+ "text_moe.experts.down_proj",
151
+ "vision_moe.experts.down_proj",
152
+ ],
153
+ operations=[ErnieFuseAndSplitTextVisionExperts(stack_dim=0, concat_dim=1)],
154
+ ),
155
+ WeightConverter(
156
+ source_patterns=[
157
+ "experts.*.gate_proj.weight",
158
+ "experts.*.up_proj.weight",
159
+ ],
160
+ target_patterns=[
161
+ "text_moe.experts.gate_up_proj",
162
+ "vision_moe.experts.gate_up_proj",
163
+ ],
164
+ operations=[ErnieFuseAndSplitTextVisionExperts(stack_dim=0, concat_dim=1)],
165
+ ),
166
+ ],
108
167
  "jamba": [
109
168
  WeightConverter(
110
169
  source_patterns=[
@@ -142,12 +201,12 @@ def _build_checkpoint_conversion_mapping():
142
201
  if hasattr(torch.nn.utils.parametrizations, "weight_norm"):
143
202
  mapping["legacy"] += [
144
203
  WeightRenaming(
145
- source_patterns="weight_g",
146
- target_patterns="parametrizations.weight.original0",
204
+ source_patterns=".weight_g$",
205
+ target_patterns=".parametrizations.weight.original0",
147
206
  ),
148
207
  WeightRenaming(
149
- source_patterns="weight_v",
150
- target_patterns="parametrizations.weight.original1",
208
+ source_patterns=".weight_v$",
209
+ target_patterns=".parametrizations.weight.original1",
151
210
  ),
152
211
  ]
153
212
  else:
@@ -166,6 +225,9 @@ def _build_checkpoint_conversion_mapping():
166
225
  mapping["deepseek_v3"] = mapping["qwen2_moe"].copy()
167
226
  mapping["dots1"] = mapping["qwen2_moe"].copy()
168
227
  mapping["ernie4_5_moe"] = mapping["qwen2_moe"].copy()
228
+ mapping["ernie4_5_moe"] += [
229
+ WeightRenaming("mlp.moe_statics.e_score_correction_bias", "mlp.gate.moe_statics.e_score_correction_bias")
230
+ ]
169
231
  mapping["glm4_moe"] = mapping["qwen2_moe"].copy()
170
232
  mapping["glm4v_moe"] = mapping["qwen2_moe"].copy()
171
233
  mapping["longcat_flash"] = mapping["qwen2_moe"].copy()
@@ -226,6 +288,7 @@ VLMS = [
226
288
  "sam3_tracker",
227
289
  "sam3_tracker_video",
228
290
  "paddleocrvl",
291
+ "ernie4_5_vl_moe",
229
292
  ]
230
293
 
231
294