transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/quantizers/quantizer_torchao.py

@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING
 from packaging import version
 
 from .base import HfQuantizer
-from .quantizers_utils import get_module_from_name
+from .quantizers_utils import get_module_from_name, should_convert_module
 
 
 if TYPE_CHECKING:
@@ -94,19 +94,19 @@ class TorchAoHfQuantizer(HfQuantizer):
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
 
-        if isinstance(self.quantization_config.quant_type, str):
-            is_int_4 = "int4" in self.quantization_config.quant_type
-        else:
-            config_name = self.quantization_config.quant_type.__class__.__name__
-            is_int_4 = fuzzy_match_size(config_name) == "4"
-
-        # TODO: better way to get the serialized key names? Hard to read from torchao codebase
-        if is_int_4:
-            self.weight_ao_keys = ["qdata", "scale", "zero_point"]
+        self.quantized_param_size = None
+        quant_type = self.quantization_config.quant_type
+        if isinstance(quant_type, str):
+            map_to_param_size = {
+                "int4_weight_only": 0.5,
+                "int8_weight_only": 1,
+                "int8_dynamic_activation_int8_weight": 1,
+            }
+            if quant_type in map_to_param_size:
+                self.quantized_param_size = map_to_param_size[quant_type]
         else:
-            self.weight_ao_keys = ["qdata", "scale"]
-        # Instead of serializing the simple torch.Tensor like usual, torchao adds a `:_data` suffix so we need this
-        self.full_ao_keys = self.weight_ao_keys + ["_data"]
+            size_digit = fuzzy_match_size(quant_type.__class__.__name__)
+            self.quantized_param_size = 0.5 if size_digit == "4" else 1
 
     def validate_environment(self, *args, **kwargs):
         if not is_torchao_available():
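
Note: `quantized_param_size` records an estimated storage cost in bytes per weight element (0.5 for int4, 1 for int8), which the new `param_element_size` override later in this file feeds into memory estimation, replacing the old serialized-key bookkeeping. A rough sketch of the arithmetic this enables (illustrative numbers, not an actual API):

    # Estimated weight memory for a hypothetical 7B-parameter model, using the
    # bytes-per-element mapping introduced in __init__ above.
    bytes_per_element = {
        "int4_weight_only": 0.5,
        "int8_weight_only": 1,
        "int8_dynamic_activation_int8_weight": 1,
    }
    num_params = 7_000_000_000
    for quant_type, size in bytes_per_element.items():
        print(f"{quant_type}: ~{num_params * size / 1024**3:.1f} GiB")
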
@@ -134,22 +134,11 @@ class TorchAoHfQuantizer(HfQuantizer):
 
     def update_dtype(self, dtype):
         if self.quantization_config.quant_type == "int4_weight_only":
-            if dtype is not None and dtype != torch.bfloat16:
+            if dtype != torch.bfloat16:
                 logger.warning_once(
-                    f"Setting dtype to {dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Please set the dtype to bfloat16."
-                )
-            if dtype is None:
-                logger.warning_once(
-                    "Setting dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set dtype=torch.bfloat16 to remove this warning."
+                    f"Setting dtype to {dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
                 )
             dtype = torch.bfloat16
-        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
-            if dtype is None:
-                logger.info(
-                    "Setting dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no dtype was specified in from_pretrained"
-                )
-                # we need to set the dtype, otherwise we have dtype mismatch when performing the quantized linear op
-                dtype = torch.float32
         return dtype
 
     def get_state_dict_and_metadata(self, model):
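
Note: with the `None` special-cases gone, `int4_weight_only` now always coerces to bfloat16 (warning once for any other requested dtype, including `None`), and `int8_dynamic_activation_int8_weight` no longer forces float32. A minimal sketch of the new behavior, assuming torchao is installed (`TorchAoConfig` is the public config class):

    import torch
    from transformers import TorchAoConfig
    from transformers.quantizers.quantizer_torchao import TorchAoHfQuantizer

    quantizer = TorchAoHfQuantizer(TorchAoConfig("int4_weight_only"))
    assert quantizer.update_dtype(torch.float16) == torch.bfloat16  # warns once, then coerces
    assert quantizer.update_dtype(None) == torch.bfloat16           # None is now coerced (and warned about) too
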
@@ -157,57 +146,27 @@ class TorchAoHfQuantizer(HfQuantizer):
         We flatten the state dict of tensor subclasses so that it is compatible with the safetensors format.
         """
         if TORCHAO_VERSION >= version.parse("0.15.0"):
-            return flatten_tensor_state_dict(model.state_dict()), {}
+            return flatten_tensor_state_dict(model.state_dict())
         else:
             raise RuntimeError(
                 f"In order to use safetensors with torchao, please use torchao version >= 0.15.0. Current version: {TORCHAO_VERSION}"
             )
 
-    def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        from accelerate.utils import CustomDtype
-
-        # Import AOBaseConfig directly since we know we have the right version
-        if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
-            from torchao.core.config import AOBaseConfig
-
-            quant_type = self.quantization_config.quant_type
-            if isinstance(quant_type, AOBaseConfig):
-                # Extract size digit using fuzzy match on the class name
-                config_name = quant_type.__class__.__name__
-                size_digit = fuzzy_match_size(config_name)
-
-                # Map the extracted digit to appropriate dtype
-                if size_digit == "4":
-                    return CustomDtype.INT4
-                else:
-                    # Default to int8
-                    return torch.int8
-
-            # Original mapping for non-AOBaseConfig types
-            map_to_target_dtype = {
-                "int4_weight_only": CustomDtype.INT4,
-                "int8_weight_only": torch.int8,
-                "int8_dynamic_activation_int8_weight": torch.int8,
-                "autoquant": None,
-            }
-            return map_to_target_dtype[self.quantization_config.quant_type]
-        else:
-            raise ValueError(
-                "You are using `device_map='auto'` on a torchao quantized model. To automatically compute"
-                " the appropriate device map, you should upgrade your `accelerate` library with "
-                "`pip install --upgrade accelerate`"
-            )
+    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
+        "Return the element size (in bytes) for `param_name`."
+        if self.param_needs_quantization(model, param_name) and self.quantized_param_size is not None:
+            return self.quantized_param_size
+
+        return super().param_element_size(model, param_name, param)
 
     def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
         # need more space for the quantization parameters (e.g. scale). Tested with int4 wo and group size = 128
         max_memory = {key: val * 0.9 for key, val in max_memory.items()}
         return max_memory
 
-    def _process_model_before_weight_loading(
-        self, model: "PreTrainedModel", keep_in_fp32_modules: list[str] | None = None, **kwargs
-    ):
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", checkpoint_files=None, **kwargs):
         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
         )
         if self.quantization_config.include_input_output_embeddings:
             input_emb = model.get_input_embeddings()
@@ -217,16 +176,16 @@ class TorchAoHfQuantizer(HfQuantizer):
         self.modules_to_not_convert = [
             x for x in self.modules_to_not_convert if x not in input_emb_names + output_emb_names
         ]
-        return
+        if checkpoint_files is not None:
+            # Torchao needs access to all metadata later
+            self.set_metadata(checkpoint_files)
 
     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
-        if self.pre_quantized:
-            return False
         if self.quantization_config.quant_type == "autoquant":
             return False
 
         # check if the param_name is not in self.modules_to_not_convert
-        if any(key + "." in param_name or key == param_name for key in self.modules_to_not_convert):
+        if not should_convert_module(param_name, self.modules_to_not_convert):
             return False
 
         # we only quantize the weight of nn.Linear and nn.Embedding
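
Note: judging from the inline check it replaces, `should_convert_module` centralizes the exclusion-list matching that quantizers previously hand-rolled. A behavioral sketch of what it presumably does (not the actual implementation in `quantizers_utils`):

    def should_convert_module(param_name: str, modules_to_not_convert) -> bool:
        # Sketch: a param is convertible unless an excluded module name matches it
        # exactly or as a dotted prefix (mirrors the removed `any(...)` condition above).
        if not modules_to_not_convert:
            return True
        return not any(key + "." in param_name or key == param_name for key in modules_to_not_convert)
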
@@ -253,22 +212,6 @@ class TorchAoHfQuantizer(HfQuantizer):
 
         return isinstance(module, tuple(_QUANTIZABLE)) and tensor_name == "weight"
 
-    def preprocess_model(self, model: "PreTrainedModel", config, dtype=None, checkpoint_files=None, **kwargs):
-        """
-        Setting model attributes and/or converting model before weights loading. At this point
-        the model should be initialized on the meta device so you can freely manipulate the skeleton
-        of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.
-
-        Args:
-            model (`~transformers.PreTrainedModel`):
-                The model to quantize
-            kwargs (`dict`, *optional*):
-                The keyword arguments that are passed along `_process_model_before_weight_loading`.
-        """
-        super().preprocess_model(model, config, dtype, checkpoint_files, **kwargs)
-        # Torchao needs access to all metadata later
-        self.set_metadata(checkpoint_files)
-
     def _process_model_after_weight_loading(self, model, **kwargs):
         """No process required for torchao quantized model"""
         if self.quantization_config.quant_type == "autoquant":
@@ -294,45 +237,6 @@ class TorchAoHfQuantizer(HfQuantizer):
             )
         return _is_torchao_serializable
 
-    def get_accelerator_warm_up_factor(self):
-        """
-        This factor is used in caching_allocator_warmup to determine how many bytes to pre-allocate for accelerator warmup.
-        - A factor of 2 means we pre-allocate the full memory footprint of the model.
-        - A factor of 4 means we pre-allocate half of that, and so on
-
-        However, when using TorchAO, calculating memory usage with param.numel() * param.element_size() doesn't give the correct size for quantized weights (like int4 or int8)
-        That's because TorchAO internally represents quantized tensors using subtensors and metadata, and the reported element_size() still corresponds to the dtype
-        not the actual bit-width of the quantized data.
-
-        To correct for this:
-        - Use a division factor of 8 for int4 weights
-        - Use a division factor of 4 for int8 weights
-        """
-        if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
-            from torchao.core.config import AOBaseConfig
-
-            quant_type = self.quantization_config.quant_type
-            # For autoquant case, it will be treated in the string implementation below in map_to_target_dtype
-            if isinstance(quant_type, AOBaseConfig):
-                # Extract size digit using fuzzy match on the class name
-                config_name = quant_type.__class__.__name__
-                size_digit = fuzzy_match_size(config_name)
-
-                if size_digit == "4":
-                    return 8
-                else:
-                    return 4
-
-            # Original mapping for non-AOBaseConfig types
-            map_to_target_dtype = {
-                "int4_weight_only": 8,
-                "int8_weight_only": 4,
-                "int8_dynamic_activation_int8_weight": 4,
-                "autoquant": 4,
-            }
-
-            return map_to_target_dtype[self.quantization_config.quant_type]
-
     @property
     def is_trainable(self) -> bool:
         supported_quant_types_for_training = [
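
Note: the removed warm-up factors and the new `quantized_param_size` values appear to encode the same estimate. Per the deleted docstring, a factor of 2 pre-allocates the full footprint, so pre-allocated bytes per element scale as element_size * 2 / factor; with a 2-byte (bf16/fp16) baseline that gives 0.5 bytes per element for int4 (factor 8) and 1 byte for int8 (factor 4), matching `__init__` above. A quick cross-check, assuming that 2-byte baseline:

    baseline_element_size = 2  # bf16/fp16 bytes per element (assumption)
    for factor, new_size in ((8, 0.5), (4, 1)):
        assert baseline_element_size * 2 / factor == new_size
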

transformers/quantizers/quantizer_vptq.py

@@ -49,24 +49,15 @@ class VptqHfQuantizer(HfQuantizer):
         if not torch.cuda.is_available():
             raise RuntimeError("GPU is required to run VTPQ quantized model.")
 
-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            dtype = torch.float16
-            logger.info(
-                "Assuming VPTQ inference on GPU and loading the model in `torch.float16`. To overwrite it, set `dtype` manually."
-            )
-        return dtype
-
     def _process_model_before_weight_loading(
         self,
         model: "PreTrainedModel",
-        keep_in_fp32_modules: list[str] | None = None,
         **kwargs,
     ):
         from ..integrations import replace_with_vptq_linear
 
         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
         )
         replace_with_vptq_linear(
             model,

transformers/testing_utils.py

@@ -118,6 +118,7 @@ from .utils import (
     is_mistral_common_available,
     is_natten_available,
     is_nltk_available,
+    is_numba_available,
     is_onnx_available,
     is_openai_available,
     is_optimum_available,
@@ -130,6 +131,7 @@ from .utils import (
     is_pyctcdecode_available,
     is_pytesseract_available,
     is_pytest_available,
+    is_pytest_order_available,
     is_pytorch_quantization_available,
     is_quark_available,
     is_qutlass_available,
@@ -221,7 +223,7 @@ if is_torch_available():
     import torch
     from safetensors.torch import load_file
 
-    from .modeling_utils import PreTrainedModel
+    from .modeling_utils import FLASH_ATTN_KERNEL_FALLBACK, PreTrainedModel
 
     IS_ROCM_SYSTEM = torch.version.hip is not None
     IS_CUDA_SYSTEM = torch.version.cuda is not None
@@ -620,7 +622,7 @@ def require_flash_attn(test_case):
     try:
         from kernels import get_kernel
 
-        get_kernel("kernels-community/flash-attn2")
+        get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
     except Exception as _:
         kernels_available = False
 
@@ -1091,17 +1093,20 @@ def require_torch_large_gpu(test_case, memory: float = 20):
     )(test_case)
 
 
-def require_torch_large_accelerator(test_case, memory: float = 20):
+def require_torch_large_accelerator(test_case=None, *, memory: float = 20):
     """Decorator marking a test that requires an accelerator with more than `memory` GiB of memory."""
-    if torch_device != "cuda" and torch_device != "xpu":
-        return unittest.skip(reason=f"test requires a GPU or XPU with more than {memory} GiB of memory")(test_case)
 
-    torch_accelerator_module = getattr(torch, torch_device)
+    def memory_decorator(tc):
+        if torch_device not in ("cuda", "xpu"):
+            return unittest.skip(f"test requires a GPU or XPU with more than {memory} GiB of memory")(tc)
 
-    return unittest.skipUnless(
-        torch_accelerator_module.get_device_properties(0).total_memory / 1024**3 > memory,
-        f"test requires a GPU or XPU with more than {memory} GiB of memory",
-    )(test_case)
+        torch_accel = getattr(torch, torch_device)
+        return unittest.skipUnless(
+            torch_accel.get_device_properties(0).total_memory / 1024**3 > memory,
+            f"test requires a GPU or XPU with more than {memory} GiB of memory",
+        )(tc)
+
+    return memory_decorator if test_case is None else memory_decorator(test_case)
 
 
 def require_torch_accelerator(test_case):
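With `test_case` now optional and `memory` keyword-only, the decorator works both bare and parameterized. A sketch of the two spellings (test names are placeholders):

```python
import unittest

from transformers.testing_utils import require_torch_large_accelerator

class LargeAcceleratorTests(unittest.TestCase):
    @require_torch_large_accelerator  # bare: defaults to memory=20 GiB
    def test_default_threshold(self):
        ...

    @require_torch_large_accelerator(memory=40)  # memory is keyword-only
    def test_higher_threshold(self):
        ...
```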
@@ -1381,6 +1386,13 @@ def require_pyctcdecode(test_case):
     return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case)
 
 
+def require_numba(test_case):
+    """
+    Decorator marking a test that requires numba
+    """
+    return unittest.skipUnless(is_numba_available(), "test requires numba")(test_case)
+
+
 def require_librosa(test_case):
     """
     Decorator marking a test that requires librosa
@@ -2659,9 +2671,13 @@ def run_first(test_case):
     single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
     allocation conflicts.
     """
-    import pytest
+    # Without this check, we get unwanted warnings when it's not installed
+    if is_pytest_order_available():
+        import pytest
 
-    return pytest.mark.order(1)(test_case)
+        return pytest.mark.order(1)(test_case)
+    else:
+        return test_case
 
 
 def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
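`run_first` now degrades gracefully when `pytest-order` is absent instead of importing `pytest` unconditionally. The same guard pattern as a standalone sketch, where the `find_spec` probe stands in for `is_pytest_order_available`:

```python
import importlib.util

def run_first_if_possible(test_case):
    # Order the test first only when the pytest-order plugin is installed;
    # otherwise hand the test back unchanged, avoiding import warnings.
    if importlib.util.find_spec("pytest_order") is not None:
        import pytest

        return pytest.mark.order(1)(test_case)
    return test_case
```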
@@ -1114,7 +1114,7 @@ class MistralCommonBackend(PushToHubMixin):
             max_length = self.model_max_length
 
         # Test if we have a padding token
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token is None or self.pad_token_id < 0):
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
             raise ValueError(
                 "Asking to pad but the tokenizer does not have a padding token. "
                 "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
@@ -1851,8 +1851,9 @@ class MistralCommonBackend(PushToHubMixin):
             raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")
 
         # Handle kwargs and AutoTokenizer/AutoProcessor case
+        # These kwargs are passed by AutoTokenizer/AutoProcessor but are not used by MistralCommonBackend
         if kwargs and not set(kwargs.keys()).issubset(
-            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto", "subfolder"}
         ):
             raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")
 
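Checking `pad_token_id` instead of `pad_token` also catches the case where a pad token string is set but was never mapped to an id; the old guard passed that case and then crashed on `None < 0`. A toy illustration with a stand-in class:

```python
# Stand-in for the backend: the pad token string exists, its id does not.
class Tok:
    pad_token = "<pad>"
    pad_token_id = None

tok = Tok()
# Old guard: `tok.pad_token is None` is False, so `tok.pad_token_id < 0`
# runs next and raises TypeError on None. New guard fails fast instead:
needs_pad_token = tok.pad_token_id is None or tok.pad_token_id < 0
print(needs_pad_token)  # True -> raise the explanatory ValueError
```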
@@ -972,7 +972,7 @@ class PreTrainedTokenizerBase(PushToHubMixin):
 
     # first name has to correspond to main model input name
     # to make sure `tokenizer.pad(...)` works correctly
-    model_input_names: list[str] = ["input_ids", "token_type_ids", "attention_mask"]
+    model_input_names: list[str] = ["input_ids", "attention_mask"]
     padding_side: str = "right"
     truncation_side: str = "right"
     slow_tokenizer_class = None
@@ -2152,9 +2152,10 @@ class PreTrainedTokenizerBase(PushToHubMixin):
         # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
         tokenizer_class = self.__class__.__name__
 
-        # tokenizers backend don't need to save added_tokens_decoder
+        # tokenizers backend don't need to save added_tokens_decoder and additional_special_tokens
         if any(base.__name__ == "TokenizersBackend" for base in self.__class__.__mro__):
             tokenizer_config.pop("added_tokens_decoder", None)
+            tokenizer_config.pop("additional_special_tokens", None)
 
         # Remove the Fast at the end if we can save the slow tokenizer
         if tokenizer_class.endswith("Fast") and getattr(self, "can_save_slow_tokenizer", False):
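`token_type_ids` leaving the base-class default means only tokenizers that explicitly override `model_input_names` (BERT-style models) keep returning it; padding stays keyed on the first entry. An illustrative check, with a placeholder checkpoint name:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
# With the new base default, tokenizers that don't override the attribute
# report only these two inputs; `pad` keys off model_input_names[0].
print(tok.model_input_names)  # e.g. ['input_ids', 'attention_mask']
```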
@@ -30,6 +30,7 @@ from tokenizers import AddedToken, processors
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
 from tokenizers.decoders import Decoder as DecoderFast
+from tokenizers.models import BPE, Unigram
 from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
 
 from .integrations.ggml import convert_gguf_tokenizer
@@ -121,7 +122,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         if isinstance(vocab, list):
             vocab = list(map(tuple, vocab))  # TODO just for now
         elif cls.model.__name__ == "Unigram":
-            vocab = list(map(tuple, vocab))
+            if vocab and isinstance(vocab[0], (list, tuple)):
+                vocab = [tuple(item) for item in vocab]
         elif cls.model.__name__ == "WordLevel":
             vocab = {token: i for i, token in enumerate(vocab)}
         elif cls.model.__name__ == "BPE" or cls.model.__name__ == "WordPiece":
@@ -182,6 +184,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
             local_kwargs["vocab"], local_kwargs["merges"] = TikTokenConverter(
                 vocab_file=vocab_file, extra_special_tokens=local_kwargs.get("extra_special_tokens")
             ).extract_vocab_merges_from_model(vocab_file)
+
             return local_kwargs
 
         # Fallback to standard vocab/merges files if they existed!
@@ -236,6 +239,9 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         add_prefix_space = kwargs.get("add_prefix_space", False)
         vocab_file = kwargs.get("vocab_file")
 
+        vocab = kwargs.get("vocab")
+        merges = kwargs.get("merges")
+
         fast_tokenizer = None
         if tokenizer_object is not None:
             fast_tokenizer = copy.deepcopy(tokenizer_object)
@@ -252,6 +258,15 @@ class TokenizersBackend(PreTrainedTokenizerBase):
             kwargs.update(tokenizer_config)
             if len(additional_kwargs) > 0:
                 kwargs.update(additional_kwargs)
+        elif self._tokenizer is None and vocab is not None:
+            # Build from vocab/merges extracted by convert_to_native_format
+            if merges is not None:
+                vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
+                fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
+            elif isinstance(vocab, dict):
+                fast_tokenizer = TokenizerFast(BPE(vocab=vocab, merges=[], fuse_unk=True, dropout=None))
+            elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
+                fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
         elif self._tokenizer is None:
             raise ValueError(
                 "Couldn't instantiate the backend tokenizer from one of: \n"
@@ -260,6 +275,11 @@ class TokenizersBackend(PreTrainedTokenizerBase):
                 "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                 "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
             )
+        # Only set defaults when creating TokenizersBackend from scratch
+        if fast_tokenizer_file is None and tokenizer_object is None and self._tokenizer is None:
+            kwargs.setdefault("bos_token", "<s>")
+            kwargs.setdefault("eos_token", "</s>")
+
         if fast_tokenizer is not None:
             self._tokenizer = fast_tokenizer
 
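The new fallback assembles the backend `Tokenizer` directly from `vocab`/`merges` kwargs. The same construction reduced to a standalone `tokenizers` snippet, with a toy vocabulary:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Toy BPE state mirroring the vocab/merges branch above
vocab = {"h": 0, "i": 1, "hi": 2}
merges = [("h", "i")]

tok = Tokenizer(BPE(vocab=vocab, merges=merges, fuse_unk=True, dropout=None))
print(tok.encode("hi").ids)  # [2]
```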
@@ -289,6 +309,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
         # Set backend to "tokenizers" if not already set
         if "backend" not in kwargs:
             kwargs["backend"] = "tokenizers"
+
         explicit_bos_eos_in_kwargs = "add_bos_token" in kwargs or "add_eos_token" in kwargs
         self._add_bos_token = kwargs.get("add_bos_token", False)
         self._add_eos_token = kwargs.get("add_eos_token", False)
@@ -339,7 +360,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
                 tokens.append(token)
         if tokens:
             # These tokens are from the special tokens map
-            self.add_tokens(tokens, special_tokens=True)
+            self.add_tokens(tokens)
 
         try:
             vocab_size = self._tokenizer.get_vocab_size()
@@ -900,6 +921,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):
 
         if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if isinstance(token_ids, dict):
+            token_ids = token_ids["input_ids"]
         return self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
 
     def _save_pretrained(
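The decode path now unwraps a dict of model inputs itself, pulling out `input_ids` before delegating to the backend. A hedged usage sketch (placeholder checkpoint; whether the `BatchEncoding` wrapper also hits this `isinstance(..., dict)` branch depends on its base class):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
inputs = dict(tok("hello world"))  # plain dict: input_ids, attention_mask
# Previously this required inputs["input_ids"]; now decode unwraps it.
print(tok.decode(inputs, skip_special_tokens=True))
```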
transformers/trainer.py CHANGED
@@ -1671,6 +1671,12 @@ class Trainer:
                 optimizer_cls = AdamW8bit
             else:
                 raise ValueError("Invalid optimizer")
+            optimizer_kwargs.update(
+                {
+                    "block_size": optim_args.get("block_size", 256),
+                    "bf16_stochastic_round": strtobool(optim_args.get("bf16_stochastic_round", "False")),
+                }
+            )
             optimizer_kwargs.update(adam_kwargs)
         elif args.optim in [
             OptimizerNames.SCHEDULE_FREE_RADAM,
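The new `block_size` and `bf16_stochastic_round` entries are read from `optim_args`, which `TrainingArguments` accepts as a comma-separated string. A hedged sketch of surfacing them (this appears to be the torchao low-bit AdamW path, so the optimizer name below is an assumption):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    optim="adamw_torch_8bit",  # assumed member of the AdamW8bit branch above
    # parsed into the optim_args dict read by the new update() call
    optim_args="block_size=128,bf16_stochastic_round=True",
)
```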
@@ -2349,7 +2355,8 @@ class Trainer:
         if self.is_fsdp_enabled:
             self.model = self.model_wrapped = model
             # Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
-            dist.fsdp.register_fsdp_forward_method(self.model, "generate")
+            if hasattr(self.model, "generate"):
+                dist.fsdp.register_fsdp_forward_method(self.model, "generate")
 
         # for the rest of this function `model` is the outside model, whether it was wrapped or not
         if model is not self.model:
@@ -3943,6 +3950,9 @@ class Trainer:
         # Both standard transformer models and Liger-patched models handle shift_labels correctly,
         # so we can directly use the computed loss from the model output.
         # See: https://huggingface.co/docs/accelerate/en/concept_guides/sequence_parallelism
+        if "labels" not in inputs and "shift_labels" in inputs:
+            # The DeepSpeed SP dataloader removes "labels" but we need it, otherwise we won't compute the loss.
+            inputs["labels"] = inputs["shift_labels"]
         outputs = model(**inputs)
         loss = outputs.loss
@@ -4018,7 +4028,16 @@ class Trainer:
             self._save(output_dir, state_dict=state_dict)
         elif self.is_deepspeed_enabled:
             try:
-                state_dict = self.accelerator.get_state_dict(self.deepspeed)
+                accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
+                    inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
+                )
+                zero3_sharding = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
+                if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
+                    # When using PEFT with DeepSpeed ZeRO Stage 3,
+                    # we do not need to load the frozen parameters
+                    state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
+                else:
+                    state_dict = self.accelerator.get_state_dict(self.deepspeed)
                 if self.args.should_save:
                     self._save(output_dir, state_dict=state_dict)
             except ValueError:
@@ -4824,6 +4843,7 @@ class Trainer:
         if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
             return
 
+        self.callback_handler.on_push_begin(self.args, self.state, self.control)
         output_dir = self.args.output_dir
         # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
         modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
@@ -4918,6 +4938,8 @@ class Trainer:
         The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
         progress of the commit if `blocking=True`.
         """
+        self.callback_handler.on_push_begin(self.args, self.state, self.control)
+
         model_name = kwargs.pop("model_name", None)
         if model_name is None and self.args.should_save:
             if self.args.hub_model_id is None:
@@ -420,6 +420,11 @@ class TrainerCallback:
         Event called after a prediction step.
         """
 
+    def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """
+        Event called before pushing the model to the hub, at the beginning of Trainer.push_to_hub and Trainer._push_from_checkpoint.
+        """
+
 
 class CallbackHandler(TrainerCallback):
     """Internal class that just calls the list of callbacks in order."""
@@ -532,6 +537,9 @@ class CallbackHandler(TrainerCallback):
     def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
         return self.call_event("on_prediction_step", args, state, control)
 
+    def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        return self.call_event("on_push_begin", args, state, control, **kwargs)
+
     def call_event(self, event, args, state, control, **kwargs):
         for callback in self.callbacks:
             result = getattr(callback, event)(
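With the event wired through `CallbackHandler`, user callbacks can now observe pushes. A minimal sketch (the logging body is illustrative):

```python
from transformers import TrainerCallback

class LogPushCallback(TrainerCallback):
    # Fires at the start of Trainer.push_to_hub and Trainer._push_from_checkpoint
    def on_push_begin(self, args, state, control, **kwargs):
        print(f"push starting at global step {state.global_step}")

# usage: Trainer(..., callbacks=[LogPushCallback()])
```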
@@ -333,7 +333,11 @@ class Seq2SeqTrainer(Trainer):
             self.model.generation_config._from_model_config = False
 
         # Retrieves GenerationConfig from model.generation_config
+        # Update with defaults because earlier the generation config used to be initialized
+        # with default values. Now we init it with `None` and keep defaults for BC
         gen_config = self.model.generation_config
+        default_gen_config = gen_config._get_default_generation_params()
+        gen_config.update(**default_gen_config, defaults_only=True)
         # in case the batch is shorter than max length, the output should be padded
         if generated_tokens.shape[-1] < gen_config.max_length:
             generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length)
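`defaults_only=True` presumably means "fill only attributes that are still unset, never clobber user choices". A standalone sketch of that semantic with plain dicts (the helper name is made up, not the private API):

```python
# Sketch of the assumed `defaults_only` semantics: fill a value only when
# the current one is still None, never overriding an explicit setting.
def update_defaults_only(config: dict, defaults: dict) -> dict:
    for key, value in defaults.items():
        if config.get(key) is None:
            config[key] = value
    return config

cfg = {"max_length": None, "num_beams": 4}
print(update_defaults_only(cfg, {"max_length": 20, "num_beams": 1}))
# {'max_length': 20, 'num_beams': 4}  -> user-set num_beams preserved
```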
@@ -1530,16 +1530,14 @@ class TrainingArguments:
                 self.greater_is_better = not self.metric_for_best_model.endswith("loss")
         if is_torch_available():
             if self.bf16 or self.bf16_full_eval:
-                if self.use_cpu and not is_torch_xla_available():
-                    # cpu
-                    raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
-                elif not self.use_cpu:
-                    if not is_torch_bf16_gpu_available() and not is_torch_xla_available():  # added for tpu support
-                        error_message = "Your setup doesn't support bf16/gpu."
-                        if is_torch_cuda_available():
-                            error_message += " You need Ampere+ GPU with cuda>=11.0"
-                        # gpu
-                        raise ValueError(error_message)
+                if (
+                    not self.use_cpu and not is_torch_bf16_gpu_available() and not is_torch_xla_available()
+                ):  # added for tpu support
+                    error_message = "Your setup doesn't support bf16/gpu. You need to assign use_cpu if you want to train the model on CPU"
+                    if is_torch_cuda_available():
+                        error_message += " You need Ampere+ GPU with cuda>=11.0"
+                    # gpu
+                    raise ValueError(error_message)
 
             if self.fp16 and self.bf16:
                 raise ValueError("At most one of fp16 and bf16 can be True, but not both")
@@ -49,6 +49,7 @@ from .generic import (
     PaddingStrategy,
     TensorType,
     TransformersKwargs,
+    _is_tensor_or_array_like,
     can_return_loss,
     can_return_tuple,
     expand_dims,
@@ -144,6 +145,7 @@ from .import_utils import (
     is_gguf_available,
     is_gptqmodel_available,
     is_grokadamw_available,
+    is_grouped_mm_available,
     is_habana_gaudi1,
     is_hadamard_available,
     is_hqq_available,
@@ -168,6 +170,7 @@ from .import_utils import (
     is_ninja_available,
     is_nltk_available,
     is_num2words_available,
+    is_numba_available,
     is_onnx_available,
     is_openai_available,
     is_optimum_available,
@@ -182,6 +185,7 @@ from .import_utils import (
     is_pyctcdecode_available,
     is_pytesseract_available,
     is_pytest_available,
+    is_pytest_order_available,
     is_pytorch_quantization_available,
     is_quanto_greater,
     is_quark_available,
@@ -21,7 +21,7 @@ from ..models.auto.auto_factory import _get_model_class
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.modeling_auto import MODEL_FOR_PRETRAINING_MAPPING, MODEL_MAPPING
 from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES, AutoProcessor
-from ..models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES, AutoTokenizer
+from ..models.auto.tokenization_auto import AutoTokenizer
 from .import_utils import is_torch_available
 
 
@@ -199,12 +199,12 @@ class AttentionMaskVisualizer:
             if "token_type_ids" in inputs:  # TODO inspect signature of update causal mask
                 kwargs["token_type_ids"] = inputs["token_type_ids"]
             tokens = processor.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-        elif self.config.model_type in TOKENIZER_MAPPING_NAMES:
+        else:
             tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
             tokens = tokenizer.tokenize(input_sentence)
             attention_mask = tokenizer(input_sentence, return_tensors="pt")["attention_mask"]
-        else:
-            raise ValueError(f"Model type {model.config.model_type} does not support attention visualization")
+        if attention_mask is None:
+            raise ValueError(f"Model type {self.config.model_type} does not support attention visualization")
 
         model.config._attn_implementation = "eager"
         model.train()
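With the `TOKENIZER_MAPPING_NAMES` gate removed, any checkpoint `AutoTokenizer` can load should now be visualizable, the error being raised only when no attention mask could be produced. A hedged usage sketch (checkpoint name is a placeholder):

```python
from transformers.utils.attention_visualizer import AttentionMaskVisualizer

visualizer = AttentionMaskVisualizer("gpt2")  # placeholder checkpoint
visualizer("The quick brown fox jumps over the lazy dog")
```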