transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -20,18 +20,19 @@ import copy
20
20
  import json
21
21
  import os
22
22
  from collections import defaultdict
23
+ from collections.abc import Iterable
23
24
  from shutil import copyfile
24
25
  from typing import Any, Optional, Union
25
26
 
26
27
  import tokenizers.pre_tokenizers as pre_tokenizers_fast
28
+ from huggingface_hub import is_offline_mode
27
29
  from tokenizers import AddedToken, processors
28
30
  from tokenizers import Encoding as EncodingFast
29
31
  from tokenizers import Tokenizer as TokenizerFast
30
- from tokenizers import normalizers as tokenizers_normalizers
31
32
  from tokenizers.decoders import Decoder as DecoderFast
33
+ from tokenizers.models import BPE, Unigram
32
34
  from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
33
35
 
34
- from .convert_slow_tokenizer import convert_slow_tokenizer
35
36
  from .integrations.ggml import convert_gguf_tokenizer
36
37
  from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
37
38
  from .tokenization_utils_base import (
@@ -41,8 +42,9 @@ from .tokenization_utils_base import (
41
42
  PreTrainedTokenizerBase,
42
43
  TextInput,
43
44
  TruncationStrategy,
45
+ generate_merges,
44
46
  )
45
- from .utils import PaddingStrategy, add_end_docstrings, is_offline_mode, logging
47
+ from .utils import PaddingStrategy, add_end_docstrings, logging
46
48
 
47
49
 
48
50
  logger = logging.get_logger(__name__)
@@ -90,26 +92,162 @@ class TokenizersBackend(PreTrainedTokenizerBase):
90
92
  """
91
93
 
92
94
  vocab_files_names = VOCAB_FILES_NAMES
95
+ model = None
96
+ _tokenizer = None
97
+
98
+ @classmethod
99
+ def convert_to_native_format(cls, trust_remote_code=False, **kwargs):
100
+ """s
101
+ Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
102
+ models, tekken.json, vocab/merges).
103
+ """
104
+ # Preserve kwargs for possible downstream use
105
+ local_kwargs = dict(kwargs)
106
+ fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)
107
+
108
+ if (
109
+ fast_tokenizer_file is not None
110
+ and os.path.isfile(fast_tokenizer_file)
111
+ and (cls is TokenizersBackend or "__init__" not in cls.__dict__ or trust_remote_code)
112
+ ):
113
+ local_kwargs["tokenizer_object"] = TokenizerFast.from_file(fast_tokenizer_file)
114
+ return local_kwargs
115
+ elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
116
+ # we extract vocab / merges from the tokenizer file to pass them to __init__
117
+ processor = TokenizerFast.from_file(fast_tokenizer_file).post_processor
118
+ with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_handle:
119
+ tokenizer_json = json.load(tokenizer_handle)
120
+ vocab = tokenizer_json.get("model", {}).get("vocab", None)
121
+ if cls.model is None:
122
+ if isinstance(vocab, list):
123
+ vocab = list(map(tuple, vocab)) # TODO just for now
124
+ elif cls.model.__name__ == "Unigram":
125
+ if vocab and isinstance(vocab[0], (list, tuple)):
126
+ vocab = [tuple(item) for item in vocab]
127
+ elif cls.model.__name__ == "WordLevel":
128
+ vocab = {token: i for i, token in enumerate(vocab)}
129
+ elif cls.model.__name__ == "BPE" or cls.model.__name__ == "WordPiece":
130
+ if isinstance(vocab, list):
131
+ vocab = {token[0] if isinstance(token, list) else token: i for i, token in enumerate(vocab)}
132
+ local_kwargs["vocab"] = vocab
133
+
134
+ model_type = getattr(cls, "model", None)
135
+ if "merges" in tokenizer_json.get("model", {}) and (model_type and model_type.__name__ == "BPE"):
136
+ merges = tokenizer_json["model"]["merges"]
137
+ merges = [tuple(merge.split(" ")) if isinstance(merge, str) else tuple(merge) for merge in merges]
138
+ local_kwargs["merges"] = merges
139
+
140
+ if processor is not None:
141
+ local_kwargs["post_processor"] = processor
142
+ return local_kwargs
143
+
144
+ vocab_file = local_kwargs.get("vocab_file")
145
+ merges_file = local_kwargs.get("merges_file")
146
+ vocab = local_kwargs.get("vocab")
147
+ merges = local_kwargs.get("merges")
148
+
149
+ # Tekken converter (Mistral)
150
+ if isinstance(vocab_file, str) and vocab_file.endswith("tekken.json") and os.path.isfile(vocab_file):
151
+ from .convert_slow_tokenizer import MistralConverter
152
+
153
+ local_kwargs["vocab"], local_kwargs["merges"] = MistralConverter(
154
+ vocab_file=vocab_file
155
+ ).extract_vocab_merges_from_model(vocab_file)
156
+ return local_kwargs
157
+
158
+ # SentencePiece model (with TikToken fallback)
159
+ if isinstance(vocab_file, str) and os.path.isfile(vocab_file) and vocab_file.endswith(".model"):
160
+ try:
161
+ from .convert_slow_tokenizer import SentencePieceExtractor
162
+
163
+ local_kwargs = SentencePieceExtractor(vocab_file).extract(cls.model, **local_kwargs)
164
+ try:
165
+ from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
166
+
167
+ converter_class = SLOW_TO_FAST_CONVERTERS.get(cls.__name__)
168
+ if converter_class is not None and hasattr(converter_class, "convert_from_spm"):
169
+ local_kwargs = converter_class.convert_from_spm(**local_kwargs)
170
+ except Exception as e:
171
+ logger.warning(
172
+ f"Could not reorder vocab using converter for {cls.__name__} due to {e}. Falling back to raw SentencePiece extraction."
173
+ )
174
+ # what used to be in `convert_slow`
175
+ if hasattr(cls, "convert_from_spm_model"):
176
+ local_kwargs = cls.convert_from_spm_model(**local_kwargs)
177
+ except Exception as e: # TODO only catch deserialization error here!
178
+ logger.warning(
179
+ f"Could not extract SentencePiece model from {vocab_file} using sentencepiece library due to {e}. "
180
+ "Falling back to TikToken extractor."
181
+ )
182
+ from .convert_slow_tokenizer import TikTokenConverter
183
+
184
+ local_kwargs["vocab"], local_kwargs["merges"] = TikTokenConverter(
185
+ vocab_file=vocab_file, extra_special_tokens=local_kwargs.get("extra_special_tokens")
186
+ ).extract_vocab_merges_from_model(vocab_file)
187
+
188
+ return local_kwargs
189
+
190
+ # Fallback to standard vocab/merges files if they existed!
191
+ if vocab is None and isinstance(vocab_file, str) and os.path.isfile(vocab_file):
192
+ local_kwargs["vocab"] = vocab_file
193
+ vocab = local_kwargs["vocab"]
194
+ if merges is None and isinstance(merges_file, str) and os.path.isfile(merges_file):
195
+ local_kwargs["merges"] = merges_file
196
+ merges = local_kwargs["merges"]
197
+
198
+ # Generate merges automatically when not provided for BPE tokenizers
199
+ if merges is None and cls.model is not None and cls.model.__name__ == "BPE" and isinstance(vocab, dict):
200
+ # Gather special tokens from kwargs to skip in merge generation
201
+ def _iter_special_tokens(values: Iterable[Any]) -> list[str]:
202
+ collected: list[str] = []
203
+ for val in values:
204
+ if val is None:
205
+ continue
206
+ if isinstance(val, (list, tuple)):
207
+ collected.extend(_iter_special_tokens(val))
208
+ else:
209
+ collected.append(str(val))
210
+ return collected
211
+
212
+ special_tokens_keys = [
213
+ "pad_token",
214
+ "unk_token",
215
+ "bos_token",
216
+ "eos_token",
217
+ "sep_token",
218
+ "cls_token",
219
+ "mask_token",
220
+ "additional_special_tokens",
221
+ "extra_special_tokens",
222
+ ]
223
+ skip_tokens: set[str] = set()
224
+ for key in special_tokens_keys:
225
+ if key in local_kwargs:
226
+ skip_tokens.update(_iter_special_tokens([local_kwargs[key]]))
227
+
228
+ merges = generate_merges(vocab, skip_tokens=skip_tokens)
229
+ local_kwargs["merges"] = merges
230
+ return local_kwargs
93
231
 
94
232
  def __init__(self, *args, **kwargs):
95
233
  tokenizer_object = kwargs.pop("tokenizer_object", None)
96
- slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
97
234
  gguf_file = kwargs.pop("gguf_file", None)
98
235
  fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
99
- from_slow = kwargs.pop("from_slow", False)
100
236
  # Note: added_tokens_decoder is NOT popped - it's passed to super().__init__() for processing
101
237
  added_tokens_decoder = kwargs.get("added_tokens_decoder", {})
102
238
  # Store add_prefix_space before super().__init__() to ensure it's not overridden
103
239
  add_prefix_space = kwargs.get("add_prefix_space", False)
240
+ vocab_file = kwargs.get("vocab_file")
241
+
242
+ vocab = kwargs.get("vocab")
243
+ merges = kwargs.get("merges")
104
244
 
245
+ fast_tokenizer = None
105
246
  if tokenizer_object is not None:
106
247
  fast_tokenizer = copy.deepcopy(tokenizer_object)
107
- elif fast_tokenizer_file is not None and not from_slow:
248
+ elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
108
249
  # We have a serialization from tokenizers which let us directly build the backend
109
250
  fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
110
- elif slow_tokenizer:
111
- # We need to convert a slow tokenizer to build the backend
112
- fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
113
251
  elif gguf_file is not None:
114
252
  # We need to convert a slow tokenizer to build the backend
115
253
  gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
@@ -120,18 +258,16 @@ class TokenizersBackend(PreTrainedTokenizerBase):
120
258
  kwargs.update(tokenizer_config)
121
259
  if len(additional_kwargs) > 0:
122
260
  kwargs.update(additional_kwargs)
123
- elif self.slow_tokenizer_class is not None and slow_tokenizer is not False:
124
- # We need to create and convert a slow tokenizer to build the backend
125
- slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
126
- fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
127
- elif not slow_tokenizer:
128
- # We tried loading a slow_tokenizer with spm and failed, try to load with tiktoken
129
- self.vocab_file = kwargs.get("vocab_file")
130
- # V5: Set _extra_special_tokens directly for converter
131
- self._extra_special_tokens = kwargs.get("extra_special_tokens", [])
132
- fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
133
- slow_tokenizer = None
134
- else:
261
+ elif self._tokenizer is None and vocab is not None:
262
+ # Build from vocab/merges extracted by convert_to_native_format
263
+ if merges is not None:
264
+ vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
265
+ fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
266
+ elif isinstance(vocab, dict):
267
+ fast_tokenizer = TokenizerFast(BPE(vocab=vocab, merges=[], fuse_unk=True, dropout=None))
268
+ elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
269
+ fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
270
+ elif self._tokenizer is None:
135
271
  raise ValueError(
136
272
  "Couldn't instantiate the backend tokenizer from one of: \n"
137
273
  "(1) a `tokenizers` library serialization file, \n"
@@ -139,11 +275,16 @@ class TokenizersBackend(PreTrainedTokenizerBase):
139
275
  "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
140
276
  "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
141
277
  )
278
+ # Only set defaults when creating TokenizersBackend from scratch
279
+ if fast_tokenizer_file is None and tokenizer_object is None and self._tokenizer is None:
280
+ kwargs.setdefault("bos_token", "<s>")
281
+ kwargs.setdefault("eos_token", "</s>")
142
282
 
143
- self._tokenizer = fast_tokenizer
283
+ if fast_tokenizer is not None:
284
+ self._tokenizer = fast_tokenizer
144
285
 
145
- if slow_tokenizer is not None:
146
- kwargs.update(slow_tokenizer.init_kwargs)
286
+ if self._tokenizer is None:
287
+ raise ValueError("The backend tokenizer is not correctly initialized.")
147
288
 
148
289
  _truncation = self._tokenizer.truncation
149
290
 
@@ -169,8 +310,17 @@ class TokenizersBackend(PreTrainedTokenizerBase):
169
310
  if "backend" not in kwargs:
170
311
  kwargs["backend"] = "tokenizers"
171
312
 
313
+ explicit_bos_eos_in_kwargs = "add_bos_token" in kwargs or "add_eos_token" in kwargs
314
+ self._add_bos_token = kwargs.get("add_bos_token", False)
315
+ self._add_eos_token = kwargs.get("add_eos_token", False)
316
+ if post_processor := kwargs.pop("post_processor", None): # most reliable way to get the post-processor
317
+ self._tokenizer.post_processor = post_processor
318
+ self._should_update_post_processor = explicit_bos_eos_in_kwargs or self._tokenizer.post_processor is None
172
319
  # We call this after having initialized the backend tokenizer because we update it.
173
320
  super().__init__(**kwargs)
321
+
322
+ if vocab_file is not None:
323
+ self.vocab_file = vocab_file
174
324
  # Ensure add_prefix_space is set correctly after parent init
175
325
  self.add_prefix_space = add_prefix_space
176
326
  self._tokenizer.encode_special_tokens = self.split_special_tokens
@@ -210,7 +360,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
210
360
  tokens.append(token)
211
361
  if tokens:
212
362
  # These tokens are from the special tokens map
213
- self.add_tokens(tokens, special_tokens=True)
363
+ self.add_tokens(tokens)
214
364
 
215
365
  try:
216
366
  vocab_size = self._tokenizer.get_vocab_size()
@@ -228,6 +378,12 @@ class TokenizersBackend(PreTrainedTokenizerBase):
228
378
  **kwargs,
229
379
  )
230
380
 
381
+ self._should_update_post_processor = (
382
+ self._should_update_post_processor or self._tokenizer.post_processor is None
383
+ )
384
+ if self._should_update_post_processor:
385
+ self.update_post_processor()
386
+
231
387
  @property
232
388
  def is_fast(self) -> bool:
233
389
  return True
@@ -273,7 +429,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
273
429
  # If eos_token is None and add_eos_token is True, silently disable add_eos_token
274
430
  # This allows tokenizers to set add_eos_token even if eos_token is not configured
275
431
  if eos is None and self.add_eos_token:
276
- self._add_eos_token = False
432
+ self.add_eos_token = False
277
433
  return
278
434
 
279
435
  single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
@@ -320,98 +476,24 @@ class TokenizersBackend(PreTrainedTokenizerBase):
320
476
  if token_value is None:
321
477
  continue
322
478
  if isinstance(token_value, AddedToken):
323
- if self._tokenizer.token_to_id(str(token_value)) is None:
324
- tokens_to_add.append(token_value)
479
+ tokens_to_add.append(token_value)
325
480
  elif isinstance(token_value, str):
326
- if self._tokenizer.token_to_id(token_value) is None:
327
- tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
481
+ tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
328
482
 
329
483
  # V5: Check extra special tokens
330
484
  for token in self._extra_special_tokens:
331
485
  if isinstance(token, AddedToken):
332
- if self._tokenizer.token_to_id(str(token)) is None:
333
- tokens_to_add.append(token)
486
+ tokens_to_add.append(token)
334
487
  elif isinstance(token, str):
335
- if self._tokenizer.token_to_id(token) is None:
336
- tokens_to_add.append(AddedToken(token, special=True, normalized=False))
488
+ tokens_to_add.append(AddedToken(token, special=True, normalized=False))
337
489
 
338
490
  if tokens_to_add:
339
491
  # Ensure special tokens are added as such to the backend
340
492
  self.add_tokens(tokens_to_add, special_tokens=True)
341
493
 
342
- if hasattr(self, "_add_bos_token") or hasattr(self, "_add_eos_token"):
494
+ if getattr(self, "_should_update_post_processor", True) or self._tokenizer.post_processor is None:
343
495
  self.update_post_processor()
344
496
 
345
- # Update add_prefix_space in the pre_tokenizer if needed
346
- if hasattr(self, "add_prefix_space"):
347
- try:
348
- tokenizer_json = json.loads(self.backend_tokenizer.to_str())
349
- pre_tok = tokenizer_json.get("pre_tokenizer", {})
350
-
351
- # Recursively update add_prefix_space in pretokenizers
352
- def update_add_prefix_space(pretok_dict, value):
353
- updated = False
354
- if pretok_dict.get("type") == "Sequence":
355
- for nested in pretok_dict.get("pretokenizers", []):
356
- updated |= update_add_prefix_space(nested, value)
357
- elif "add_prefix_space" in pretok_dict and pretok_dict["add_prefix_space"] != value:
358
- pretok_dict["add_prefix_space"] = value
359
- updated = True
360
- return updated
361
-
362
- if update_add_prefix_space(pre_tok, self.add_prefix_space):
363
- self._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
364
- except Exception:
365
- pass
366
-
367
- # Ensure normalizer flags (lowercase/accents/chinese chars) reflect tokenizer attributes
368
- try:
369
- normalizer = self.backend_tokenizer.normalizer
370
- if normalizer is not None:
371
- norm_state = json.loads(normalizer.__getstate__())
372
- norm_type = norm_state.get("type")
373
-
374
- desired_lowercase = getattr(self, "do_lower_case", None)
375
- desired_strip_accents = getattr(self, "strip_accents", None)
376
- # Some tokenizers expose keep_accents instead of strip_accents
377
- if desired_strip_accents is None and hasattr(self, "keep_accents") and "strip_accents" in norm_state:
378
- keep_accents_value = getattr(self, "keep_accents")
379
- if keep_accents_value is not None:
380
- desired_strip_accents = not keep_accents_value
381
- desired_handle_chinese = getattr(self, "tokenize_chinese_chars", None)
382
-
383
- updated = False
384
- if (
385
- desired_lowercase is not None
386
- and "lowercase" in norm_state
387
- and norm_state["lowercase"] != desired_lowercase
388
- ):
389
- norm_state["lowercase"] = desired_lowercase
390
- updated = True
391
- if (
392
- desired_strip_accents is not None
393
- and "strip_accents" in norm_state
394
- and norm_state["strip_accents"] != desired_strip_accents
395
- ):
396
- norm_state["strip_accents"] = desired_strip_accents
397
- updated = True
398
- if (
399
- desired_handle_chinese is not None
400
- and "handle_chinese_chars" in norm_state
401
- and norm_state["handle_chinese_chars"] != desired_handle_chinese
402
- ):
403
- norm_state["handle_chinese_chars"] = desired_handle_chinese
404
- updated = True
405
-
406
- if updated and norm_type is not None:
407
- norm_class = getattr(tokenizers_normalizers, norm_type, None)
408
- if norm_class is not None:
409
- norm_state.pop("type", None)
410
- self.backend_tokenizer.normalizer = norm_class(**norm_state)
411
- except Exception:
412
- # Best-effort: do not block initialization on normalizer reconciliation
413
- pass
414
-
415
497
  @property
416
498
  def vocab_size(self) -> int:
417
499
  """
@@ -839,6 +921,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):
839
921
 
840
922
  if isinstance(token_ids, int):
841
923
  token_ids = [token_ids]
924
+ if isinstance(token_ids, dict):
925
+ token_ids = token_ids["input_ids"]
842
926
  return self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
843
927
 
844
928
  def _save_pretrained(
@@ -1132,7 +1216,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
1132
1216
  ]
1133
1217
  ):
1134
1218
  return tokenizer
1135
- elif transformers_version and version.parse(transformers_version) >= version.parse("4.57.3"):
1219
+ elif transformers_version and version.parse(transformers_version) > version.parse("4.57.3"):
1136
1220
  return tokenizer
1137
1221
 
1138
1222
  mistral_config_detected = True
transformers/trainer.py CHANGED
@@ -642,6 +642,16 @@ class Trainer:
642
642
  "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
643
643
  )
644
644
  default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
645
+
646
+ # Add JIT checkpoint callback if enabled
647
+ if self.args.enable_jit_checkpoint:
648
+ from .trainer_jit_checkpoint import JITCheckpointCallback
649
+
650
+ jit_callback = JITCheckpointCallback()
651
+ default_callbacks = default_callbacks + [jit_callback]
652
+ # Set trainer reference for JIT callback after initialization
653
+ jit_callback.set_trainer(self)
654
+
645
655
  callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
646
656
  self.callback_handler = CallbackHandler(
647
657
  callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
@@ -1661,6 +1671,12 @@ class Trainer:
1661
1671
  optimizer_cls = AdamW8bit
1662
1672
  else:
1663
1673
  raise ValueError("Invalid optimizer")
1674
+ optimizer_kwargs.update(
1675
+ {
1676
+ "block_size": optim_args.get("block_size", 256),
1677
+ "bf16_stochastic_round": strtobool(optim_args.get("bf16_stochastic_round", "False")),
1678
+ }
1679
+ )
1664
1680
  optimizer_kwargs.update(adam_kwargs)
1665
1681
  elif args.optim in [
1666
1682
  OptimizerNames.SCHEDULE_FREE_RADAM,
@@ -2338,6 +2354,9 @@ class Trainer:
2338
2354
 
2339
2355
  if self.is_fsdp_enabled:
2340
2356
  self.model = self.model_wrapped = model
2357
+ # Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
2358
+ if hasattr(self.model, "generate"):
2359
+ dist.fsdp.register_fsdp_forward_method(self.model, "generate")
2341
2360
 
2342
2361
  # for the rest of this function `model` is the outside model, whether it was wrapped or not
2343
2362
  if model is not self.model:
@@ -2428,8 +2447,6 @@ class Trainer:
2428
2447
 
2429
2448
  for epoch in range(epochs_trained, num_train_epochs):
2430
2449
  epoch_dataloader = train_dataloader
2431
- if hasattr(epoch_dataloader, "set_epoch"):
2432
- epoch_dataloader.set_epoch(epoch)
2433
2450
 
2434
2451
  steps_in_epoch = (
2435
2452
  len(epoch_dataloader)
@@ -2450,6 +2467,9 @@ class Trainer:
2450
2467
  elif steps_trained_in_current_epoch == 0:
2451
2468
  self._load_rng_state(resume_from_checkpoint)
2452
2469
 
2470
+ if hasattr(epoch_dataloader, "set_epoch"):
2471
+ epoch_dataloader.set_epoch(epoch)
2472
+
2453
2473
  epoch_iterator = iter(epoch_dataloader)
2454
2474
  # We chunkify the epoch iterator into gradient accumulation steps `n` batches
2455
2475
  remainder = steps_in_epoch % args.gradient_accumulation_steps
@@ -2788,7 +2808,7 @@ class Trainer:
2788
2808
  )
2789
2809
  else:
2790
2810
  # We load the model state dict on the CPU to avoid an OOM error.
2791
- if self.args.save_safetensors and os.path.isfile(safe_weights_file):
2811
+ if os.path.isfile(safe_weights_file):
2792
2812
  state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
2793
2813
  else:
2794
2814
  check_torch_load_is_safe()
@@ -2828,9 +2848,7 @@ class Trainer:
2828
2848
  logger.warning(f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed")
2829
2849
  else:
2830
2850
  # We load the sharded checkpoint
2831
- load_result = load_sharded_checkpoint(
2832
- model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors
2833
- )
2851
+ load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
2834
2852
  if not is_sagemaker_mp_enabled():
2835
2853
  self._issue_warnings_after_load(load_result)
2836
2854
 
@@ -2913,7 +2931,7 @@ class Trainer:
2913
2931
  has_been_loaded = False
2914
2932
  else:
2915
2933
  # We load the model state dict on the CPU to avoid an OOM error.
2916
- if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
2934
+ if os.path.isfile(best_safe_model_path):
2917
2935
  state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
2918
2936
  else:
2919
2937
  check_torch_load_is_safe()
@@ -3932,6 +3950,9 @@ class Trainer:
3932
3950
  # Both standard transformer models and Liger-patched models handle shift_labels correctly,
3933
3951
  # so we can directly use the computed loss from the model output.
3934
3952
  # See: https://huggingface.co/docs/accelerate/en/concept_guides/sequence_parallelism
3953
+ if "labels" not in inputs and "shift_labels" in inputs:
3954
+ # DeepSpeed SP Dataloader removes "labels" but we need it, otherwise, we won't compute the loss.
3955
+ inputs["labels"] = inputs["shift_labels"]
3935
3956
  outputs = model(**inputs)
3936
3957
  loss = outputs.loss
3937
3958
 
@@ -4007,7 +4028,16 @@ class Trainer:
4007
4028
  self._save(output_dir, state_dict=state_dict)
4008
4029
  elif self.is_deepspeed_enabled:
4009
4030
  try:
4010
- state_dict = self.accelerator.get_state_dict(self.deepspeed)
4031
+ accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
4032
+ inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
4033
+ )
4034
+ zero3_sharding = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
4035
+ if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
4036
+ # When using PEFT with DeepSpeed ZeRO Stage 3,
4037
+ # we do not need to load the frozen parameters
4038
+ state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
4039
+ else:
4040
+ state_dict = self.accelerator.get_state_dict(self.deepspeed)
4011
4041
  if self.args.should_save:
4012
4042
  self._save(output_dir, state_dict=state_dict)
4013
4043
  except ValueError:
@@ -4067,12 +4097,7 @@ class Trainer:
4067
4097
  model = model.module.module
4068
4098
  unwrapped_model = self.accelerator.unwrap_model(model)
4069
4099
  if isinstance(unwrapped_model, supported_classes):
4070
- unwrapped_model.save_pretrained(
4071
- output_dir,
4072
- state_dict=full_state_dict,
4073
- save_function=xm.save,
4074
- safe_serialization=self.args.save_safetensors,
4075
- )
4100
+ unwrapped_model.save_pretrained(output_dir, state_dict=full_state_dict)
4076
4101
  else:
4077
4102
  logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
4078
4103
  xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
@@ -4082,8 +4107,6 @@ class Trainer:
4082
4107
  output_dir,
4083
4108
  is_main_process=self.args.should_save,
4084
4109
  state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
4085
- save_function=xm.save,
4086
- safe_serialization=self.args.save_safetensors,
4087
4110
  )
4088
4111
  else:
4089
4112
  logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
@@ -4093,8 +4116,6 @@ class Trainer:
4093
4116
  model.save_pretrained(
4094
4117
  output_dir,
4095
4118
  is_main_process=self.args.should_save,
4096
- save_function=xm.save,
4097
- safe_serialization=self.args.save_safetensors,
4098
4119
  state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
4099
4120
  )
4100
4121
  if self.processing_class is not None and self.args.should_save:
@@ -4115,20 +4136,15 @@ class Trainer:
4115
4136
 
4116
4137
  if isinstance(self.accelerator.unwrap_model(self.model, keep_torch_compile=False), supported_classes):
4117
4138
  self.accelerator.unwrap_model(self.model, keep_torch_compile=False).save_pretrained(
4118
- output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
4139
+ output_dir, state_dict=state_dict
4119
4140
  )
4120
4141
  else:
4121
4142
  logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
4122
- if self.args.save_safetensors:
4123
- safetensors.torch.save_file(
4124
- state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
4125
- )
4126
- else:
4127
- torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
4143
+ safetensors.torch.save_file(
4144
+ state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
4145
+ )
4128
4146
  else:
4129
- self.model.save_pretrained(
4130
- output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
4131
- )
4147
+ self.model.save_pretrained(output_dir, state_dict=state_dict)
4132
4148
 
4133
4149
  if self.processing_class is not None:
4134
4150
  self.processing_class.save_pretrained(output_dir)
@@ -4827,6 +4843,7 @@ class Trainer:
4827
4843
  if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
4828
4844
  return
4829
4845
 
4846
+ self.callback_handler.on_push_begin(self.args, self.state, self.control)
4830
4847
  output_dir = self.args.output_dir
4831
4848
  # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
4832
4849
  modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
@@ -4921,6 +4938,8 @@ class Trainer:
4921
4938
  The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
4922
4939
  progress of the commit if `blocking=True`.
4923
4940
  """
4941
+ self.callback_handler.on_push_begin(self.args, self.state, self.control)
4942
+
4924
4943
  model_name = kwargs.pop("model_name", None)
4925
4944
  if model_name is None and self.args.should_save:
4926
4945
  if self.args.hub_model_id is None:
@@ -5074,14 +5093,14 @@ class Trainer:
5074
5093
  self.is_tp_enabled = False
5075
5094
  if getattr(self.model, "tp_size", None) is not None and self.model.tp_size > 1:
5076
5095
  self.is_tp_enabled = True
5077
- if self.args.parallelism_config is not None:
5078
- if is_accelerate_available("1.10.1"):
5079
- if self.args.parallelism_config is not None:
5096
+ if self.args.parallelism_config is None:
5097
+ if is_accelerate_available("1.12.0"):
5098
+ if self.args.parallelism_config is None:
5080
5099
  from accelerate import ParallelismConfig
5081
5100
 
5082
5101
  args["parallelism_config"] = ParallelismConfig(tp_size=self.model.tp_size)
5083
5102
  else:
5084
- raise ValueError("Requires accelerate>1.10.1 to use Tensor Parallelism.")
5103
+ raise ValueError("Requires accelerate>1.12.0 to use Tensor Parallelism.")
5085
5104
 
5086
5105
  if is_accelerate_available("1.2.0"):
5087
5106
  # it we don't have the correct version, we will rely on env var instead that were set in TrainingArguments
@@ -420,6 +420,11 @@ class TrainerCallback:
420
420
  Event called after a prediction step.
421
421
  """
422
422
 
423
+ def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
424
+ """
425
+ Event called before pushing the model to the hub, at the beginning of Trainer.push_to_hub and Trainer._push_from_checkpoint.
426
+ """
427
+
423
428
 
424
429
  class CallbackHandler(TrainerCallback):
425
430
  """Internal class that just calls the list of callbacks in order."""
@@ -532,6 +537,9 @@ class CallbackHandler(TrainerCallback):
532
537
  def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
533
538
  return self.call_event("on_prediction_step", args, state, control)
534
539
 
540
+ def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
541
+ return self.call_event("on_push_begin", args, state, control, **kwargs)
542
+
535
543
  def call_event(self, event, args, state, control, **kwargs):
536
544
  for callback in self.callbacks:
537
545
  result = getattr(callback, event)(