transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,126 @@
1
+ import os
2
+ import signal
3
+ import threading
4
+ from typing import Optional
5
+
6
+ from .trainer_callback import TrainerCallback
7
+ from .trainer_utils import PREFIX_CHECKPOINT_DIR
8
+ from .utils import logging
9
+
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
class CheckpointManager:
    """
    Handles Just-In-Time (JIT) checkpointing when a SIGTERM signal is received.

    On SIGTERM, a timer is armed for `kill_wait` seconds; once it elapses,
    `is_checkpoint_requested` is set so the training callbacks can save a
    checkpoint and stop the training loop gracefully.
    """

    def __init__(self, trainer, kill_wait: int = 3):
        """
        Initialize the CheckpointManager for Just-In-Time checkpoint handling.

        Args:
            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
        """
        self.trainer = trainer
        self.is_checkpoint_requested = False
        self._original_sigterm_handler = None
        self.kill_wait = kill_wait
        # Guards against arming several timers when SIGTERM is delivered repeatedly
        # before the grace period has elapsed (is_checkpoint_requested is still False then).
        self._sigterm_received = False

    def setup_signal_handler(self):
        """Install the SIGTERM handler, keeping a reference to the previous one so it can be restored later."""
        self._original_sigterm_handler = signal.signal(signal.SIGTERM, self._sigterm_handler)
        logger.info("JIT checkpoint signal handler registered for SIGTERM")

    def _sigterm_handler(self, signum, frame):
        # Ignore duplicate SIGTERMs: only the first one arms the checkpoint timer.
        if self._sigterm_received or self.is_checkpoint_requested:
            return
        self._sigterm_received = True

        logger.info(f"SIGTERM received, will request JIT checkpoint after {self.kill_wait}s")
        # Daemon thread so a pending timer never keeps the interpreter alive if
        # training finishes normally during the grace period.
        timer = threading.Timer(self.kill_wait, self._enable_checkpoint)
        timer.daemon = True
        timer.start()

    def _enable_checkpoint(self):
        # Timer callback: from here on the training callbacks will trigger the save.
        logger.info("Kill wait period elapsed, requesting checkpoint")
        self.is_checkpoint_requested = True

    def execute_jit_checkpoint(self):
        """Save a checkpoint immediately, marking it with a sentinel file while it is incomplete."""
        try:
            # Set checkpoint flag to False to avoid multiple checkpoints getting triggered by other callbacks
            self.is_checkpoint_requested = False

            logger.info("Starting JIT checkpointing...")
            current_step = self.trainer.state.global_step
            logger.info(f"Saving JIT checkpoint at step {current_step}")

            output_dir = self.trainer._get_output_dir(trial=None)
            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{current_step}"
            checkpoint_path = os.path.join(output_dir, checkpoint_folder)

            # Create checkpoint directory
            os.makedirs(checkpoint_path, exist_ok=True)

            # Create a sentinel file to indicate checkpointing is in progress
            sentinel_file = os.path.join(checkpoint_path, "checkpoint-is-incomplete.txt")
            with open(sentinel_file, "w") as f:
                f.write(f"Checkpoint started at step {current_step} and in progress...")
            logger.info(f"Created checkpoint progress sentinel marker file: {sentinel_file}")

            # Invoke the trainer's checkpoint method directly
            self.trainer._save_checkpoint(self.trainer.model, trial=None)

            # Remove sentinel file upon successful checkpointing
            if os.path.exists(sentinel_file):
                os.remove(sentinel_file)
                logger.info("Sentinel marker file removed")

            logger.info("Immediate JIT checkpoint completed successfully")

        except Exception as e:
            logger.error(f"Failed to save JIT checkpoint: {e}")
            raise
78
+
79
+
80
class JITCheckpointCallback(TrainerCallback):
    """
    Callback for Just-In-Time checkpointing on SIGTERM signals.

    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
    the Trainer's training loop to exit gracefully after saving the checkpoint.
    """

    def __init__(self):
        self.trainer = None
        self.jit_manager: Optional[CheckpointManager] = None

    def set_trainer(self, trainer):
        """Attach the trainer and, when `args.enable_jit_checkpoint` is set, install the SIGTERM handler."""
        self.trainer = trainer
        if trainer.args.enable_jit_checkpoint:
            self.jit_manager = CheckpointManager(trainer=trainer)
            self.jit_manager.setup_signal_handler()
            logger.info("JIT checkpointing enabled")

    def _maybe_checkpoint(self, control, suppress_save=False):
        # Shared logic for the training hooks: when the signal handler has requested a
        # checkpoint, stop training and save it right away.
        if self.jit_manager and self.jit_manager.is_checkpoint_requested:
            if suppress_save:
                # Prevent the regular checkpointing logic from saving a duplicate.
                control.should_save = False
            control.should_training_stop = True
            self.jit_manager.execute_jit_checkpoint()

    def on_pre_optimizer_step(self, args, state, control, **kwargs):
        self._maybe_checkpoint(control)

    def on_step_begin(self, args, state, control, **kwargs):
        self._maybe_checkpoint(control)

    def on_step_end(self, args, state, control, **kwargs):
        self._maybe_checkpoint(control, suppress_save=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        self._maybe_checkpoint(control, suppress_save=True)

    def on_train_end(self, args, state, control, **kwargs):
        # Restore original SIGTERM handler
        if self.jit_manager and self.jit_manager._original_sigterm_handler is not None:
            signal.signal(signal.SIGTERM, self.jit_manager._original_sigterm_handler)
            logger.info("Restored original SIGTERM handler after training completion")
@@ -333,7 +333,11 @@ class Seq2SeqTrainer(Trainer):
333
333
  self.model.generation_config._from_model_config = False
334
334
 
335
335
  # Retrieves GenerationConfig from model.generation_config
336
+ # Update with defaults because earlier the generation config used to be init
337
+ # with default values. Now we init it with `None` and keep defaults for BC
336
338
  gen_config = self.model.generation_config
339
+ default_gen_config = gen_config._get_default_generation_params()
340
+ gen_config.update(**default_gen_config, defaults_only=True)
337
341
  # in case the batch is shorter than max length, the output should be padded
338
342
  if generated_tokens.shape[-1] < gen_config.max_length:
339
343
  generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length)
@@ -924,7 +924,7 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
924
924
  shard_files = list(set(index["weight_map"].values()))
925
925
 
926
926
  # If strict=True, error before loading any of the state dicts.
927
- # TODO: Here, update the weigth map with the config.dynamic_weight_conversion
927
+ # TODO: Here, update the weight map with the config.dynamic_weight_conversion
928
928
  loaded_keys = index["weight_map"].keys()
929
929
  model_keys = model.state_dict().keys()
930
930
  missing_keys = [key for key in model_keys if key not in loaded_keys]
@@ -340,9 +340,17 @@ class TrainingArguments:
340
340
  `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained
341
341
  alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two
342
342
  checkpoints are saved: the last one and the best one (if they are different).
343
- save_safetensors (`bool`, *optional*, defaults to `True`):
344
- Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of
345
- default `torch.load` and `torch.save`.
343
+ enable_jit_checkpoint (`bool`, *optional*, defaults to `False`):
344
+ Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. When enabled, training will
345
+ checkpoint upon receiving SIGTERM, allowing for graceful termination without losing
346
+ progress. This is particularly useful for shared clusters with preemptible workloads (e.g., Kueue).
347
+ **Important**: You must configure your orchestrator's graceful shutdown period to allow sufficient time
348
+ for checkpoint completion. For Kubernetes, set `terminationGracePeriodSeconds` in your job definition
349
+ (method varies by cloud-native trainer: Kubeflow, Ray, etc.). Note: the default is only 30 seconds,
350
+ which is typically insufficient. For Slurm, use `--signal=USR1@<seconds>` in your sbatch script to send
351
+ SIGTERM with adequate time before the job time limit. Calculate the required grace period as: longest
352
+ possible iteration time + checkpoint saving time. For example, if an iteration takes 2 minutes and
353
+ checkpoint saving takes 2 minutes, set at least 4 minutes (240 seconds) of grace time.
346
354
  save_on_each_node (`bool`, *optional*, defaults to `False`):
347
355
  When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on
348
356
  the main one.
@@ -585,9 +593,9 @@ class TrainingArguments:
585
593
  instance of `Dataset`.
586
594
  report_to (`str` or `list[str]`, *optional*, defaults to `"none"`):
587
595
  The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
588
- `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`,
589
- `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
590
- installed, `"none"` for no integrations.
596
+ `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"swanlab"`,
597
+ `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
598
+ for no integrations.
591
599
  project (`str`, *optional*, defaults to `"huggingface"`):
592
600
  The name of the project to use for logging. Currently, only used by Trackio.
593
601
  trackio_space_id (`str` or `None`, *optional*, defaults to `"trackio"`):
@@ -852,7 +860,7 @@ class TrainingArguments:
852
860
  warmup_ratio: float | None = field(
853
861
  default=None,
854
862
  metadata={
855
- "help": "This argument is deprecated and will be removed in v5. Use `warmup_steps` instead as it also works with float values."
863
+ "help": "This argument is deprecated and will be removed in v5.2. Use `warmup_steps` instead as it also works with float values."
856
864
  },
857
865
  )
858
866
 
@@ -929,14 +937,24 @@ class TrainingArguments:
929
937
  " for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be"
930
938
  " retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,"
931
939
  " it is possible that two checkpoints are saved: the last one and the best one (if they are different)."
932
- " Default is unlimited checkpoints"
940
+ " Default is unlimited checkpoints."
933
941
  )
934
942
  },
935
943
  )
936
- save_safetensors: bool = field(
937
- default=True,
944
+ enable_jit_checkpoint: bool = field(
945
+ default=False,
938
946
  metadata={
939
- "help": "Use safetensors saving and loading for state dicts instead of default torch.load and torch.save."
947
+ "help": (
948
+ "Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. "
949
+ "When enabled, training will checkpoint upon receiving SIGTERM, "
950
+ "allowing for graceful termination without losing progress. "
951
+ "This is particularly useful for shared clusters with preemptible workloads (Kueue). "
952
+ "IMPORTANT: You must configure your orchestrator's graceful shutdown period. "
953
+ "Kubernetes: set terminationGracePeriodSeconds (default 30s is insufficient!) in your job definition. "
954
+ "Slurm: use --signal=USR1@<seconds> in sbatch to send SIGTERM before time limit. "
955
+ "Calculate required grace period as: iteration time + checkpoint saving time. "
956
+ "Example: 2min iteration + 2min checkpoint = 240 seconds minimum."
957
+ )
940
958
  },
941
959
  )
942
960
  save_on_each_node: bool = field(
@@ -1504,14 +1522,6 @@ class TrainingArguments:
1504
1522
  f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
1505
1523
  )
1506
1524
 
1507
- if not self.save_safetensors:
1508
- logger.info(
1509
- f"Found safetensors installation, but --save_safetensors={self.save_safetensors}. "
1510
- f"Safetensors should be a preferred weights saving format due to security and performance reasons. "
1511
- f"If your model cannot be saved by safetensors please feel free to open an issue at "
1512
- f"https://github.com/huggingface/safetensors!"
1513
- )
1514
-
1515
1525
  if (
1516
1526
  self.load_best_model_at_end or self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU
1517
1527
  ) and self.metric_for_best_model is None:
@@ -1520,16 +1530,14 @@ class TrainingArguments:
1520
1530
  self.greater_is_better = not self.metric_for_best_model.endswith("loss")
1521
1531
  if is_torch_available():
1522
1532
  if self.bf16 or self.bf16_full_eval:
1523
- if self.use_cpu and not is_torch_xla_available():
1524
- # cpu
1525
- raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
1526
- elif not self.use_cpu:
1527
- if not is_torch_bf16_gpu_available() and not is_torch_xla_available(): # added for tpu support
1528
- error_message = "Your setup doesn't support bf16/gpu."
1529
- if is_torch_cuda_available():
1530
- error_message += " You need Ampere+ GPU with cuda>=11.0"
1531
- # gpu
1532
- raise ValueError(error_message)
1533
+ if (
1534
+ not self.use_cpu and not is_torch_bf16_gpu_available() and not is_torch_xla_available()
1535
+ ): # added for tpu support
1536
+ error_message = "Your setup doesn't support bf16/gpu. You need to assign use_cpu if you want to train the model on CPU"
1537
+ if is_torch_cuda_available():
1538
+ error_message += " You need Ampere+ GPU with cuda>=11.0"
1539
+ # gpu
1540
+ raise ValueError(error_message)
1533
1541
 
1534
1542
  if self.fp16 and self.bf16:
1535
1543
  raise ValueError("At most one of fp16 and bf16 can be True, but not both")
@@ -2359,8 +2367,8 @@ class TrainingArguments:
2359
2367
  report_to (`str` or `list[str]`, *optional*, defaults to `"none"`):
2360
2368
  The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
2361
2369
  `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
2362
- `"neptune"`, `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all
2363
- integrations installed, `"none"` for no integrations.
2370
+ `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
2371
+ installed, `"none"` for no integrations.
2364
2372
  first_step (`bool`, *optional*, defaults to `False`):
2365
2373
  Whether to log and evaluate the first `global_step` or not.
2366
2374
  nan_inf_filter (`bool`, *optional*, defaults to `True`):
@@ -2565,7 +2573,7 @@ class TrainingArguments:
2565
2573
  ```
2566
2574
  """
2567
2575
  if warmup_ratio is not None:
2568
- logger.warning("warmup_ratio is deprecated and will be removed in v5. Use `warmup_steps` instead.")
2576
+ logger.warning("warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.")
2569
2577
  warmup_steps = warmup_ratio
2570
2578
 
2571
2579
  self.lr_scheduler_type = SchedulerType(name)
@@ -2742,10 +2750,24 @@ class TrainingArguments:
2742
2750
  fsdp_plugin_args["transformer_cls_names_to_wrap"] = ",".join(
2743
2751
  self.fsdp_config["transformer_layer_cls_to_wrap"]
2744
2752
  )
2745
- fsdp_plugin_args["fsdp_version"] = self.fsdp_config.get("fsdp_version", 1)
2753
+ fsdp_version = int(self.fsdp_config.get("version", 1))
2754
+ fsdp_plugin_args["fsdp_version"] = fsdp_version
2746
2755
  prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH")
2747
- fsdp_plugin_args["backward_prefetch"] = prefetch_policy.upper()
2748
- fsdp_plugin_args["forward_prefetch"] = str(self.fsdp_config.get("forward_prefetch", "false")).lower()
2756
+ if fsdp_version == 2:
2757
+ fsdp_plugin_args["reshard_after_forward"] = str_to_bool(
2758
+ str(self.fsdp_config.get("reshard_after_forward", "false")).lower()
2759
+ )
2760
+ else:
2761
+ fsdp_plugin_args["forward_prefetch"] = str_to_bool(
2762
+ str(self.fsdp_config.get("forward_prefetch", "false")).lower()
2763
+ )
2764
+ fsdp_plugin_args["backward_prefetch"] = prefetch_policy.upper()
2765
+ fsdp_plugin_args["reshard_after_forward"] = str(
2766
+ self.fsdp_config.get("reshard_after_forward", "FULL_SHARD")
2767
+ ).lower()
2768
+ fsdp_plugin_args["use_orig_params"] = str_to_bool(
2769
+ str(self.fsdp_config.get("use_orig_params", "true")).lower()
2770
+ )
2749
2771
 
2750
2772
  sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower()
2751
2773
  cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower()
@@ -2755,11 +2777,10 @@ class TrainingArguments:
2755
2777
  raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`')
2756
2778
 
2757
2779
  # we need to set the env here as otherwise we get a warning in accelerate + we need to set it for transformers
2758
- fsdp_plugin_args["cpu_ram_efficient_loading"] = cpu_ram_efficient_loading
2780
+ fsdp_plugin_args["cpu_ram_efficient_loading"] = str_to_bool(cpu_ram_efficient_loading)
2759
2781
  os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading
2760
2782
 
2761
- fsdp_plugin_args["sync_module_states"] = sync_module_states
2762
- fsdp_plugin_args["use_orig_params"] = str(self.fsdp_config.get("use_orig_params", "true")).lower()
2783
+ fsdp_plugin_args["sync_module_states"] = str_to_bool(sync_module_states)
2763
2784
 
2764
2785
  return fsdp_plugin_args
2765
2786
 
@@ -2771,3 +2792,18 @@ class ParallelMode(Enum):
2771
2792
  SAGEMAKER_MODEL_PARALLEL = "sagemaker_model_parallel"
2772
2793
  SAGEMAKER_DATA_PARALLEL = "sagemaker_data_parallel"
2773
2794
  TPU = "tpu"
2795
+
2796
+
2797
+ def str_to_bool(value, to_bool: bool = True) -> int | bool:
2798
+ """
2799
+ Converts a string representation of truth to `True` (1) or `False` (0).
2800
+
2801
+ True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False values are `n`, `no`, `f`, `false`, `off`, and `0`;
2802
+ """
2803
+ value = value.lower()
2804
+ if value in ("y", "yes", "t", "true", "on", "1"):
2805
+ return 1 if not to_bool else True
2806
+ elif value in ("n", "no", "f", "false", "off", "0"):
2807
+ return 0 if not to_bool else False
2808
+ else:
2809
+ raise ValueError(f"invalid truth value {value}")
@@ -49,6 +49,7 @@ from .generic import (
49
49
  PaddingStrategy,
50
50
  TensorType,
51
51
  TransformersKwargs,
52
+ _is_tensor_or_array_like,
52
53
  can_return_loss,
53
54
  can_return_tuple,
54
55
  expand_dims,
@@ -91,7 +92,6 @@ from .hub import (
91
92
  extract_commit_hash,
92
93
  has_file,
93
94
  http_user_agent,
94
- is_offline_mode,
95
95
  list_repo_templates,
96
96
  try_to_load_from_cache,
97
97
  )
@@ -114,8 +114,6 @@ from .import_utils import (
114
114
  is_apex_available,
115
115
  is_apollo_torch_available,
116
116
  is_aqlm_available,
117
- is_auto_awq_available,
118
- is_auto_gptq_available,
119
117
  is_auto_round_available,
120
118
  is_av_available,
121
119
  is_bitsandbytes_available,
@@ -129,7 +127,8 @@ from .import_utils import (
129
127
  is_datasets_available,
130
128
  is_decord_available,
131
129
  is_detectron2_available,
132
- is_eetq_available,
130
+ is_env_variable_false,
131
+ is_env_variable_true,
133
132
  is_essentia_available,
134
133
  is_faiss_available,
135
134
  is_fbgemm_gpu_available,
@@ -146,6 +145,7 @@ from .import_utils import (
146
145
  is_gguf_available,
147
146
  is_gptqmodel_available,
148
147
  is_grokadamw_available,
148
+ is_grouped_mm_available,
149
149
  is_habana_gaudi1,
150
150
  is_hadamard_available,
151
151
  is_hqq_available,
@@ -161,6 +161,7 @@ from .import_utils import (
161
161
  is_libcst_available,
162
162
  is_librosa_available,
163
163
  is_liger_kernel_available,
164
+ is_llm_awq_available,
164
165
  is_lomo_available,
165
166
  is_matplotlib_available,
166
167
  is_mistral_common_available,
@@ -169,6 +170,7 @@ from .import_utils import (
169
170
  is_ninja_available,
170
171
  is_nltk_available,
171
172
  is_num2words_available,
173
+ is_numba_available,
172
174
  is_onnx_available,
173
175
  is_openai_available,
174
176
  is_optimum_available,
@@ -183,6 +185,7 @@ from .import_utils import (
183
185
  is_pyctcdecode_available,
184
186
  is_pytesseract_available,
185
187
  is_pytest_available,
188
+ is_pytest_order_available,
186
189
  is_pytorch_quantization_available,
187
190
  is_quanto_greater,
188
191
  is_quark_available,
@@ -21,7 +21,7 @@ from ..models.auto.auto_factory import _get_model_class
21
21
  from ..models.auto.configuration_auto import AutoConfig
22
22
  from ..models.auto.modeling_auto import MODEL_FOR_PRETRAINING_MAPPING, MODEL_MAPPING
23
23
  from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES, AutoProcessor
24
- from ..models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES, AutoTokenizer
24
+ from ..models.auto.tokenization_auto import AutoTokenizer
25
25
  from .import_utils import is_torch_available
26
26
 
27
27
 
@@ -199,12 +199,12 @@ class AttentionMaskVisualizer:
199
199
  if "token_type_ids" in inputs: # TODO inspect signature of update causal mask
200
200
  kwargs["token_type_ids"] = inputs["token_type_ids"]
201
201
  tokens = processor.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
202
- elif self.config.model_type in TOKENIZER_MAPPING_NAMES:
202
+ else:
203
203
  tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
204
204
  tokens = tokenizer.tokenize(input_sentence)
205
205
  attention_mask = tokenizer(input_sentence, return_tensors="pt")["attention_mask"]
206
- else:
207
- raise ValueError(f"Model type {model.config.model_type} does not support attention visualization")
206
+ if attention_mask is None:
207
+ raise ValueError(f"Model type {self.config.model_type} does not support attention visualization")
208
208
 
209
209
  model.config._attn_implementation = "eager"
210
210
  model.train()
@@ -17,7 +17,8 @@ import inspect
17
17
  import os
18
18
  import textwrap
19
19
  from pathlib import Path
20
- from typing import get_args
20
+ from types import UnionType
21
+ from typing import Union, get_args, get_origin
21
22
 
22
23
  import regex as re
23
24
 
@@ -67,6 +68,7 @@ HARDCODED_CONFIG_FOR_MODELS = {
67
68
  "donut": "DonutSwinConfig",
68
69
  "esmfold": "EsmConfig",
69
70
  "parakeet": "ParakeetCTCConfig",
71
+ "lasr": "LasrCTCConfig",
70
72
  }
71
73
 
72
74
  _re_checkpoint = re.compile(r"\[(.+?)\]\((https://huggingface\.co/.+?)\)")
@@ -1279,38 +1281,46 @@ def _get_model_info(func, parent_class):
1279
1281
  return model_name_lowercase, class_name, config_class
1280
1282
 
1281
1283
 
1282
- def _process_parameter_type(param, param_name, func):
1284
+ def _process_parameter_type(param):
1283
1285
  """
1284
1286
  Process and format a parameter's type annotation.
1285
1287
 
1286
1288
  Args:
1287
1289
  param (`inspect.Parameter`): The parameter from the function signature
1288
- param_name (`str`): The name of the parameter
1289
- func (`function`): The function the parameter belongs to
1290
1290
  """
1291
1291
  optional = False
1292
- if param.annotation != inspect.Parameter.empty:
1293
- param_type = param.annotation
1294
- if "typing" in str(param_type):
1295
- param_type = "".join(str(param_type).split("typing.")).replace("transformers.", "~")
1296
- elif hasattr(param_type, "__module__"):
1297
- param_type = f"{param_type.__module__.replace('transformers.', '~').replace('builtins', '')}.{param.annotation.__name__}"
1298
- if param_type[0] == ".":
1299
- param_type = param_type[1:]
1300
- else:
1301
- if False:
1302
- print(
1303
- f"[ERROR] {param_type} for {param_name} of {func.__qualname__} in file {func.__code__.co_filename} has an invalid type"
1304
- )
1305
- if "ForwardRef" in param_type:
1306
- param_type = re.sub(r"ForwardRef\('([\w.]+)'\)", r"\1", param_type)
1307
- if "Optional" in param_type:
1308
- param_type = re.sub(r"Optional\[(.*?)\]", r"\1", param_type)
1292
+ if param.annotation == inspect.Parameter.empty:
1293
+ return "", False
1294
+ elif param.annotation is None:
1295
+ return "None", True
1296
+ # This is, astonishingly, the right way to do it: https://docs.python.org/3/library/typing.html#typing.Union
1297
+ elif get_origin(param.annotation) is Union or get_origin(param.annotation) is UnionType:
1298
+ subtypes = get_args(param.annotation)
1299
+ else:
1300
+ subtypes = [param.annotation] # Just pretend it's a single-element union so we don't need two code paths
1301
+ out_str = []
1302
+ for subtype in subtypes:
1303
+ if subtype is type(None):
1309
1304
  optional = True
1305
+ continue
1306
+ if hasattr(subtype, "__module__") and hasattr(subtype, "__name__"):
1307
+ subtype = f"{subtype.__module__.replace('transformers.', '~').replace('builtins', '').replace('typing.', '')}.{subtype.__name__}".removeprefix(
1308
+ "."
1309
+ )
1310
+ else:
1311
+ subtype = str(subtype) # Just give up
1312
+ if "ForwardRef" in subtype:
1313
+ subtype = re.sub(r"ForwardRef\('([\w.]+)'\)", r"\1", subtype)
1314
+ out_str.append(subtype)
1315
+
1316
+ if param.default is not inspect.Parameter.empty:
1317
+ optional = True
1318
+ if not out_str:
1319
+ return "", optional
1320
+ elif len(out_str) == 1:
1321
+ return out_str[0], optional
1310
1322
  else:
1311
- param_type = ""
1312
-
1313
- return param_type, optional
1323
+ return f"Union[{', '.join(out_str)}]", optional
1314
1324
 
1315
1325
 
1316
1326
  def _get_parameter_info(param_name, documented_params, source_args_dict, param_type, optional):
@@ -1391,7 +1401,7 @@ def _process_regular_parameters(
1391
1401
  continue
1392
1402
 
1393
1403
  # Process parameter type and optional status
1394
- param_type, optional = _process_parameter_type(param, param_name, func)
1404
+ param_type, optional = _process_parameter_type(param)
1395
1405
 
1396
1406
  # Check for default value
1397
1407
  param_default = ""
@@ -21,7 +21,7 @@ import os
21
21
  import warnings
22
22
  from collections import OrderedDict, UserDict, defaultdict
23
23
  from collections.abc import Callable, Iterable, MutableMapping
24
- from contextlib import AbstractContextManager, ExitStack
24
+ from contextlib import AbstractContextManager, ExitStack, nullcontext
25
25
  from dataclasses import dataclass, fields, is_dataclass
26
26
  from enum import Enum
27
27
  from functools import partial, wraps
@@ -42,6 +42,7 @@ _is_torch_available = False
42
42
  if is_torch_available():
43
43
  # required for @can_return_tuple decorator to work with torchdynamo
44
44
  import torch
45
+ from torch.types import _dtype
45
46
 
46
47
  from ..model_debugging_utils import model_addition_debugger_context
47
48
 
@@ -154,6 +155,48 @@ def is_torch_dtype(x):
154
155
  return isinstance(x, torch.dtype)
155
156
 
156
157
 
158
+ def _is_tensor_or_array_like(value):
159
+ """
160
+ Check if a value is array-like (includes ragged arrays)
161
+ """
162
+ if is_numpy_array(value):
163
+ return True
164
+ if is_torch_tensor(value):
165
+ return True
166
+ if isinstance(value, (int, float, bool, np.number)):
167
+ return True
168
+
169
+ if isinstance(value, (list, tuple)):
170
+ if len(value) == 0:
171
+ # consider empty list or nested list as array-like
172
+ return True
173
+ return _is_tensor_or_array_like(value[0])
174
+
175
+ return False
176
+
177
+
178
+ def maybe_autocast(
179
+ device_type: str,
180
+ dtype: Optional["_dtype"] = None,
181
+ enabled: bool = True,
182
+ cache_enabled: Optional[bool] = None,
183
+ ):
184
+ """
185
+ Context manager that only autocasts if:
186
+
187
+ - `autocast` is already enabled in this context
188
+ - Or this call to `maybe_autocast` has `enabled=True`
189
+
190
+ This prevents `autocast` being added to the graph when it is effectively a no-op.
191
+ Which makes graph splitting in `torch.compile` more flexible as it removes the
192
+ requirement that partition IDs be monotonically increasing.
193
+ """
194
+ if torch.is_autocast_enabled(device_type) or enabled:
195
+ return torch.autocast(device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
196
+ else:
197
+ return nullcontext()
198
+
199
+
157
200
  def _is_mlx(x):
158
201
  import mlx.core as mx
159
202
 
@@ -680,6 +723,8 @@ class TransformersKwargs(TypedDict, total=False):
680
723
  Maximum sequence length for query state.
681
724
  max_length_k (`int`, *optional*):
682
725
  Maximum sequence length for key state.
726
+ position_ids (`torch.LongTensor`, *optional*):
727
+ Indices of positions of each input sequence tokens.
683
728
  """
684
729
 
685
730
  num_items_in_batch: Optional["torch.Tensor"]
@@ -690,6 +735,7 @@ class TransformersKwargs(TypedDict, total=False):
690
735
  cu_seq_lens_k: Optional["torch.LongTensor"]
691
736
  max_length_q: int | None
692
737
  max_length_k: int | None
738
+ position_ids: Optional["torch.LongTensor"]
693
739
 
694
740
 
695
741
  def is_timm_config_dict(config_dict: dict[str, Any]) -> bool: