transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -38,6 +38,7 @@ import types
38
38
  import unittest
39
39
  from collections import UserDict, defaultdict
40
40
  from collections.abc import Callable, Generator, Iterable, Iterator, Mapping
41
+ from contextlib import contextmanager
41
42
  from dataclasses import MISSING, fields
42
43
  from functools import cache, wraps
43
44
  from io import StringIO
@@ -72,13 +73,13 @@ from .integrations.deepspeed import is_deepspeed_available
72
73
  from .utils import (
73
74
  ACCELERATE_MIN_VERSION,
74
75
  GGUF_MIN_VERSION,
76
+ SAFE_WEIGHTS_INDEX_NAME,
75
77
  TRITON_MIN_VERSION,
78
+ WEIGHTS_INDEX_NAME,
76
79
  is_accelerate_available,
77
80
  is_apex_available,
78
81
  is_apollo_torch_available,
79
82
  is_aqlm_available,
80
- is_auto_awq_available,
81
- is_auto_gptq_available,
82
83
  is_auto_round_available,
83
84
  is_av_available,
84
85
  is_bitsandbytes_available,
@@ -88,7 +89,6 @@ from .utils import (
88
89
  is_cython_available,
89
90
  is_decord_available,
90
91
  is_detectron2_available,
91
- is_eetq_available,
92
92
  is_essentia_available,
93
93
  is_faiss_available,
94
94
  is_fbgemm_gpu_available,
@@ -219,14 +219,19 @@ _COMMON_MODEL_NAMES_MAP = {
219
219
 
220
220
  if is_torch_available():
221
221
  import torch
222
+ from safetensors.torch import load_file
223
+
224
+ from .modeling_utils import PreTrainedModel
222
225
 
223
226
  IS_ROCM_SYSTEM = torch.version.hip is not None
224
227
  IS_CUDA_SYSTEM = torch.version.cuda is not None
225
228
  IS_XPU_SYSTEM = getattr(torch.version, "xpu", None) is not None
229
+ IS_NPU_SYSTEM = getattr(torch, "npu", None) is not None
226
230
  else:
227
231
  IS_ROCM_SYSTEM = False
228
232
  IS_CUDA_SYSTEM = False
229
233
  IS_XPU_SYSTEM = False
234
+ IS_NPU_SYSTEM = False
230
235
 
231
236
  logger = transformers_logging.get_logger(__name__)
232
237
 
@@ -266,6 +271,7 @@ _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=Fa
266
271
  _run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False)
267
272
  _run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True)
268
273
  _run_agent_tests = parse_flag_from_env("RUN_AGENT_TESTS", default=False)
274
+ _run_training_tests = parse_flag_from_env("RUN_TRAINING_TESTS", default=True)
269
275
 
270
276
 
271
277
  def is_staging_test(test_case):
@@ -316,6 +322,22 @@ def is_agent_test(test_case):
316
322
  return pytest.mark.is_agent_test()(test_case)
317
323
 
318
324
 
325
+ def is_training_test(test_case):
326
+ """
327
+ Decorator marking a test as a training test. If RUN_TRAINING_TESTS is set to a falsy value, those tests will be
328
+ skipped.
329
+ """
330
+ if not _run_training_tests:
331
+ return unittest.skip(reason="test is training test")(test_case)
332
+ else:
333
+ try:
334
+ import pytest # We don't need a hard dependency on pytest in the main library
335
+ except ImportError:
336
+ return test_case
337
+ else:
338
+ return pytest.mark.is_training_test()(test_case)
339
+
340
+
319
341
  def slow(test_case):
320
342
  """
321
343
  Decorator marking a test as slow.
@@ -637,6 +659,9 @@ def require_read_token(test_case):
637
659
  if getattr(attr, "__require_read_token__", False):
638
660
  continue
639
661
  wrapped = require_read_token(attr)
662
+ if isinstance(inspect.getattr_static(test_case, attr_name), staticmethod):
663
+ # Don't accidentally bind staticmethods to `self`
664
+ wrapped = staticmethod(wrapped)
640
665
  setattr(test_case, attr_name, wrapped)
641
666
  return test_case
642
667
  else:
@@ -649,10 +674,6 @@ def require_read_token(test_case):
649
674
  with patch("huggingface_hub.utils._headers.get_token", return_value=token):
650
675
  return test_case(*args, **kwargs)
651
676
  else: # Allow running locally with the default token env variable
652
- # dealing with static/class methods and called by `self.xxx`
653
- if "staticmethod" in inspect.getsource(test_case).strip():
654
- if len(args) > 0 and isinstance(args[0], unittest.TestCase):
655
- return test_case(*args[1:], **kwargs)
656
677
  return test_case(*args, **kwargs)
657
678
 
658
679
  wrapper.__require_read_token__ = True
@@ -1239,23 +1260,6 @@ def require_spqr(test_case):
1239
1260
  return unittest.skipUnless(is_spqr_available(), "test requires spqr")(test_case)
1240
1261
 
1241
1262
 
1242
- def require_eetq(test_case):
1243
- """
1244
- Decorator marking a test that requires eetq
1245
- """
1246
- eetq_available = is_eetq_available()
1247
- if eetq_available:
1248
- try:
1249
- import eetq # noqa: F401
1250
- except ImportError as exc:
1251
- if "shard_checkpoint" in str(exc):
1252
- # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
1253
- # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
1254
- # TODO: Remove once eetq releases a fix and this release is used in CI
1255
- eetq_available = False
1256
- return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
1257
-
1258
-
1259
1263
  def require_av(test_case):
1260
1264
  """
1261
1265
  Decorator marking a test that requires av
@@ -1291,13 +1295,11 @@ def require_tensorboard(test_case):
1291
1295
  return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")
1292
1296
 
1293
1297
 
1294
- def require_gptq(test_case):
1298
+ def require_gptqmodel(test_case):
1295
1299
  """
1296
- Decorator for auto_gptq dependency
1300
+ Decorator for gptqmodel dependency
1297
1301
  """
1298
- return unittest.skipUnless(
1299
- is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq"
1300
- )(test_case)
1302
+ return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case)
1301
1303
 
1302
1304
 
1303
1305
  def require_hqq(test_case):
@@ -1307,13 +1309,6 @@ def require_hqq(test_case):
1307
1309
  return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case)
1308
1310
 
1309
1311
 
1310
- def require_auto_awq(test_case):
1311
- """
1312
- Decorator for auto_awq dependency
1313
- """
1314
- return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case)
1315
-
1316
-
1317
1312
  def require_auto_round(test_case):
1318
1313
  """
1319
1314
  Decorator for auto_round dependency
@@ -3192,6 +3187,8 @@ def get_device_properties() -> DeviceProperties:
3192
3187
  gen_mask = 0x000000FF00000000
3193
3188
  gen = (arch & gen_mask) >> 32
3194
3189
  return ("xpu", gen, None)
3190
+ elif IS_NPU_SYSTEM:
3191
+ return ("npu", None, None)
3195
3192
  else:
3196
3193
  return (torch_device, None, None)
3197
3194
 
@@ -4092,3 +4089,267 @@ def write_file(file, content):
4092
4089
  def read_json_file(file):
4093
4090
  with open(file, "r") as fh:
4094
4091
  return json.load(fh)
4092
+
4093
+
4094
+ # =============================================================================
4095
+ # Training CI Utilities - Logging and Memory Monitoring
4096
+ # =============================================================================
4097
+
4098
+
4099
+ # ANSI color codes for terminal output
4100
+ class Colors:
4101
+ """ANSI color codes for terminal output formatting."""
4102
+
4103
+ RESET = "\033[0m"
4104
+ BOLD = "\033[1m"
4105
+ DIM = "\033[2m"
4106
+
4107
+ # Foreground colors
4108
+ RED = "\033[31m"
4109
+ GREEN = "\033[32m"
4110
+ YELLOW = "\033[33m"
4111
+ BLUE = "\033[34m"
4112
+ MAGENTA = "\033[35m"
4113
+ CYAN = "\033[36m"
4114
+ WHITE = "\033[37m"
4115
+
4116
+ # Bright variants
4117
+ BRIGHT_RED = "\033[91m"
4118
+ BRIGHT_GREEN = "\033[92m"
4119
+ BRIGHT_YELLOW = "\033[93m"
4120
+ BRIGHT_BLUE = "\033[94m"
4121
+ BRIGHT_CYAN = "\033[96m"
4122
+
4123
+
4124
+ class ColoredFormatter(logging.Formatter):
4125
+ """Custom formatter that adds colors based on log level."""
4126
+
4127
+ LEVEL_COLORS = {
4128
+ logging.DEBUG: Colors.DIM + Colors.CYAN,
4129
+ logging.INFO: Colors.WHITE,
4130
+ logging.WARNING: Colors.BRIGHT_YELLOW,
4131
+ logging.ERROR: Colors.BRIGHT_RED,
4132
+ logging.CRITICAL: Colors.BOLD + Colors.BRIGHT_RED,
4133
+ }
4134
+
4135
+ # Loggers that should be dimmed (less important/verbose)
4136
+ DIMMED_LOGGERS = {"httpx", "httpcore", "urllib3", "requests"}
4137
+
4138
+ def __init__(self, fmt: str | None = None, datefmt: str | None = None):
4139
+ super().__init__(fmt, datefmt)
4140
+
4141
+ def format(self, record: logging.LogRecord) -> str:
4142
+ # Check if this logger should be dimmed
4143
+ is_dimmed = record.name in self.DIMMED_LOGGERS
4144
+
4145
+ if is_dimmed:
4146
+ # Dim the entire log line for httpx and similar
4147
+ timestamp = self.formatTime(record, self.datefmt)
4148
+ message = record.getMessage()
4149
+ return f"{Colors.DIM}{timestamp} - {record.name} - {record.levelname:8} - {message}{Colors.RESET}"
4150
+
4151
+ # Get color for this level
4152
+ color = self.LEVEL_COLORS.get(record.levelno, Colors.RESET)
4153
+
4154
+ # Color the level name
4155
+ levelname = record.levelname
4156
+ colored_levelname = f"{color}{levelname:8}{Colors.RESET}"
4157
+
4158
+ # Color the timestamp
4159
+ colored_time = f"{Colors.DIM}{self.formatTime(record, self.datefmt)}{Colors.RESET}"
4160
+
4161
+ # Color the logger name
4162
+ colored_name = f"{Colors.BLUE}{record.name}{Colors.RESET}"
4163
+
4164
+ # Get message
4165
+ message = record.getMessage()
4166
+
4167
+ return f"{colored_time} - {colored_name} - {colored_levelname} - {message}"
4168
+
4169
+
4170
+ _warn_once_logged: set[str] = set()
4171
+
4172
+
4173
+ def init_test_logger() -> logging.Logger:
4174
+ """Initialize a test-specific logger with colored stderr handler and INFO level for tests.
4175
+
4176
+ Uses a named logger instead of root logger to avoid conflicts with pytest-xdist parallel execution.
4177
+ Uses stderr instead of stdout to avoid deadlocks with pytest-xdist output capture.
4178
+ """
4179
+ logger = logging.getLogger("transformers.training_test")
4180
+ logger.setLevel(logging.INFO)
4181
+
4182
+ # Only add handler if not already present (avoid duplicate handlers on repeated calls)
4183
+ if not logger.handlers:
4184
+ # Use stderr instead of stdout - pytest-xdist captures stdout which can cause deadlocks
4185
+ ch = logging.StreamHandler(sys.stderr)
4186
+ ch.setLevel(logging.INFO)
4187
+
4188
+ # Use colored formatter if terminal supports it, plain otherwise
4189
+ if sys.stderr.isatty():
4190
+ formatter = ColoredFormatter(datefmt="%Y-%m-%d %H:%M:%S")
4191
+ else:
4192
+ formatter = logging.Formatter(
4193
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
4194
+ )
4195
+
4196
+ ch.setFormatter(formatter)
4197
+ logger.addHandler(ch)
4198
+
4199
+ logger.propagate = False # Don't propagate to root logger to avoid duplicate output
4200
+ return logger
4201
+
4202
+
4203
+ def warn_once(logger_instance: logging.Logger, msg: str) -> None:
4204
+ """Log a warning message only once per unique message.
4205
+
4206
+ Uses a global set to track messages that have already been logged
4207
+ to prevent duplicate warning messages from cluttering the output.
4208
+
4209
+ Args:
4210
+ logger_instance: The logger instance to use for warning.
4211
+ msg: The warning message to log.
4212
+ """
4213
+ if msg not in _warn_once_logged:
4214
+ logger_instance.warning(msg)
4215
+ _warn_once_logged.add(msg)
4216
+
4217
+
4218
+ # Named tuple for passing memory stats for logging
4219
+ MemoryStats = collections.namedtuple(
4220
+ "MemoryStats",
4221
+ [
4222
+ "rss_gib", # Resident Set Size in GiB
4223
+ "rss_pct", # RSS as percentage of total memory
4224
+ "vms_gib", # Virtual Memory Size in GiB
4225
+ "peak_rss_gib", # Peak RSS in GiB
4226
+ "peak_rss_pct", # Peak RSS as percentage of total memory
4227
+ "available_gib", # Available system memory in GiB
4228
+ "total_gib", # Total system memory in GiB
4229
+ ],
4230
+ )
4231
+
4232
+
4233
+ class CPUMemoryMonitor:
4234
+ """Monitor CPU memory usage for the current process."""
4235
+
4236
+ def __init__(self):
4237
+ self.device_name = "CPU"
4238
+ self._peak_rss = 0
4239
+ self._process = None
4240
+ self.total_memory = 0
4241
+ self.total_memory_gib = 0
4242
+
4243
+ if is_psutil_available():
4244
+ import psutil
4245
+
4246
+ self._process = psutil.Process(os.getpid())
4247
+ mem_info = psutil.virtual_memory()
4248
+ self.total_memory = mem_info.total
4249
+ self.total_memory_gib = self._to_gib(self.total_memory)
4250
+
4251
+ def _to_gib(self, memory_in_bytes: int) -> float:
4252
+ """Convert bytes to GiB."""
4253
+ return memory_in_bytes / (1024 * 1024 * 1024)
4254
+
4255
+ def _to_pct(self, memory_in_bytes: int) -> float:
4256
+ """Convert bytes to percentage of total memory."""
4257
+ if self.total_memory == 0:
4258
+ return 0.0
4259
+ return 100.0 * memory_in_bytes / self.total_memory
4260
+
4261
+ def _update_peak(self) -> None:
4262
+ """Update peak memory tracking."""
4263
+ if self._process is not None:
4264
+ current_rss = self._process.memory_info().rss
4265
+ self._peak_rss = max(self._peak_rss, current_rss)
4266
+
4267
+ def get_stats(self) -> MemoryStats:
4268
+ """Get current memory statistics."""
4269
+ if not is_psutil_available():
4270
+ return MemoryStats(0, 0, 0, 0, 0, 0, 0)
4271
+
4272
+ import psutil
4273
+
4274
+ self._update_peak()
4275
+
4276
+ mem_info = self._process.memory_info()
4277
+ sys_mem = psutil.virtual_memory()
4278
+
4279
+ return MemoryStats(
4280
+ rss_gib=self._to_gib(mem_info.rss),
4281
+ rss_pct=self._to_pct(mem_info.rss),
4282
+ vms_gib=self._to_gib(mem_info.vms),
4283
+ peak_rss_gib=self._to_gib(self._peak_rss),
4284
+ peak_rss_pct=self._to_pct(self._peak_rss),
4285
+ available_gib=self._to_gib(sys_mem.available),
4286
+ total_gib=self._to_gib(sys_mem.total),
4287
+ )
4288
+
4289
+ def reset_peak_stats(self) -> None:
4290
+ """Reset peak memory tracking."""
4291
+ if self._process is not None:
4292
+ self._peak_rss = self._process.memory_info().rss
4293
+
4294
+
4295
+ def build_cpu_memory_monitor(logger_instance: logging.Logger | None = None) -> CPUMemoryMonitor:
4296
+ """Build and initialize a CPU memory monitor.
4297
+
4298
+ Args:
4299
+ logger_instance: Optional logger to log initialization info. If None, no logging is done.
4300
+
4301
+ Returns:
4302
+ CPUMemoryMonitor instance.
4303
+ """
4304
+ monitor = CPUMemoryMonitor()
4305
+ if logger_instance is not None:
4306
+ if is_psutil_available():
4307
+ logger_instance.info(f"CPU memory monitor initialized: {monitor.total_memory_gib:.2f} GiB total")
4308
+ else:
4309
+ logger_instance.warning("psutil not available, memory monitoring disabled")
4310
+ return monitor
4311
+
4312
+
4313
+ def convert_all_safetensors_to_bins(folder: str):
4314
+ """Convert all safetensors files into torch bin files, to mimic saving with torch (since we still support loading
4315
+ bin files, but not saving them anymore)"""
4316
+ for file in os.listdir(folder):
4317
+ path = os.path.join(folder, file)
4318
+ if file.endswith(".safetensors"):
4319
+ new_path = path.replace(".safetensors", ".bin").replace("model", "pytorch_model")
4320
+ state_dict = load_file(path)
4321
+ os.remove(path)
4322
+ torch.save(state_dict, new_path)
4323
+ # Adapt the index as well
4324
+ elif file == SAFE_WEIGHTS_INDEX_NAME:
4325
+ new_path = os.path.join(folder, WEIGHTS_INDEX_NAME)
4326
+ with open(path) as f:
4327
+ index = json.loads(f.read())
4328
+ os.remove(path)
4329
+ if "weight_map" in index.keys():
4330
+ weight_map = index["weight_map"]
4331
+ new_weight_map = {}
4332
+ for k, v in weight_map.items():
4333
+ new_weight_map[k] = v.replace(".safetensors", ".bin").replace("model", "pytorch_model")
4334
+ index["weight_map"] = new_weight_map
4335
+ with open(new_path, "w") as f:
4336
+ f.write(json.dumps(index, indent=4))
4337
+
4338
+
4339
+ @contextmanager
4340
+ def force_serialization_as_bin_files():
4341
+ """Since we don't support saving with torch `.bin` files anymore, but still support loading them, we use this context
4342
+ to easily create the bin files and try to load them back"""
4343
+ try:
4344
+ # Monkey patch the method to save as bin files
4345
+ original_save = PreTrainedModel.save_pretrained
4346
+
4347
+ def new_save(self, save_directory, *args, **kwargs):
4348
+ original_save(self, save_directory, *args, **kwargs)
4349
+ convert_all_safetensors_to_bins(save_directory)
4350
+
4351
+ PreTrainedModel.save_pretrained = new_save
4352
+
4353
+ yield
4354
+ finally:
4355
+ PreTrainedModel.save_pretrained = original_save
@@ -1986,3 +1986,7 @@ class MistralCommonBackend(PushToHubMixin):
1986
1986
  if mode not in [ValidationMode.finetuning, ValidationMode.test]:
1987
1987
  raise ValueError(_invalid_mode_msg)
1988
1988
  return mode
1989
+
1990
+
1991
+ # Backward compatibility alias for codebases still importing the legacy name.
1992
+ MistralCommonTokenizer = MistralCommonBackend