transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/auto/tokenization_auto.py

@@ -15,7 +15,6 @@
 """Auto Tokenizer class."""

 import importlib
-import inspect
 import json
 import os
 from collections import OrderedDict
@@ -26,8 +25,7 @@ from transformers.utils.import_utils import is_mistral_common_available
 from ...configuration_utils import PreTrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
-from ...tokenization_python import PreTrainedTokenizer, PythonBackend
-from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE, find_sentencepiece_model_file, load_vocab_and_merges
+from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...utils import (
     extract_commit_hash,
     is_g2p_en_available,
@@ -35,7 +33,7 @@ from ...utils import (
     is_tokenizers_available,
     logging,
 )
-from ...utils.hub import cached_file, has_file
+from ...utils.hub import cached_file
 from ..encoder_decoder import EncoderDecoderConfig
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
@@ -68,8 +66,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("aimv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
         ("albert", "AlbertTokenizer" if is_tokenizers_available() else None),
         ("align", "BertTokenizer" if is_tokenizers_available() else None),
-        ("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("arcee", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("aria", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
         ("bark", "BertTokenizer" if is_tokenizers_available() else None),
         ("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -82,19 +80,19 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("big_bird", "BigBirdTokenizer" if is_tokenizers_available() else None),
         ("bigbird_pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("biogpt", "BioGptTokenizer"),
-        ("bitnet", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("bitnet", "TokenizersBackend" if is_tokenizers_available() else None),
         ("blenderbot", "BlenderbotTokenizer" if is_tokenizers_available() else None),
         ("blenderbot-small", "BlenderbotSmallTokenizer"),
         ("blip", "BertTokenizer" if is_tokenizers_available() else None),
         ("blip-2", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("bloom", "TokenizersBackend" if is_tokenizers_available() else None),
-        ("blt", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("blt", "TokenizersBackend" if is_tokenizers_available() else None),
         ("bridgetower", "RobertaTokenizer"),
         ("bros", "BertTokenizer" if is_tokenizers_available() else None),
         ("byt5", "ByT5Tokenizer"),
         ("camembert", "CamembertTokenizer" if is_tokenizers_available() else None),
         ("canine", "CanineTokenizer"),
-        ("chameleon", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("chameleon", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None),
         ("clap", "RobertaTokenizer"),
         ("clip", "CLIPTokenizer" if is_tokenizers_available() else None),
@@ -104,34 +102,34 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("cohere", "CohereTokenizer" if is_tokenizers_available() else None),
         ("cohere2", "CohereTokenizer" if is_tokenizers_available() else None),
-        ("colpali", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("colpali", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("colqwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
         ("convbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("cpm", "CpmTokenizer" if is_tokenizers_available() else None),
         ("cpmant", "CpmAntTokenizer"),
-        ("csm", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("csm", "TokenizersBackend" if is_tokenizers_available() else None),
         ("ctrl", "CTRLTokenizer"),
         ("data2vec-audio", "Wav2Vec2CTCTokenizer"),
         ("data2vec-text", "RobertaTokenizer"),
         ("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("deberta", "DebertaTokenizer" if is_tokenizers_available() else None),
         ("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None),
-        ("deepseek_v2", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("deepseek_v3", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("deepseek_vl", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("deepseek_vl_hybrid", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("deepseek_v2", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_v3", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_vl", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_vl_hybrid", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("dia", "DiaTokenizer"),
-        ("diffllama", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("diffllama", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("distilbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("dpr", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None),
         ("electra", "BertTokenizer" if is_tokenizers_available() else None),
         ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
-        ("ernie4_5", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("ernie4_5_moe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("ernie4_5", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("ernie4_5_moe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("esm", "EsmTokenizer"),
         ("exaone4", "GPT2Tokenizer" if is_tokenizers_available() else None),
-        ("falcon", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("falcon", "TokenizersBackend" if is_tokenizers_available() else None),
         ("falcon_mamba", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
         ("flaubert", "FlaubertTokenizer"),
@@ -141,6 +139,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("fnet", "FNetTokenizerFast" if is_tokenizers_available() else None),
         ("fsmt", "FSMTTokenizer"),
         ("funnel", "FunnelTokenizer" if is_tokenizers_available() else None),
+        ("fuyu", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gemma", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma3", "GemmaTokenizerFast" if is_tokenizers_available() else None),
@@ -148,19 +147,19 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("gemma3n", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma3n_text", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("git", "BertTokenizer" if is_tokenizers_available() else None),
-        ("glm", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("glm4", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("glm4_moe", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("glm4v", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("glm4v_moe", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("got_ocr2", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("glm", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4_moe", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4v", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4v_moe", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("got_ocr2", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None),
         ("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_neo", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_neox", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
         ("gpt_neox_japanese", "GPTNeoXJapaneseTokenizer"),
-        ("gpt_oss", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("gpt_oss", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gptj", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("granite", "GPT2Tokenizer"),
         ("granitemoe", "GPT2Tokenizer"),
@@ -168,35 +167,35 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("granitemoeshared", "GPT2Tokenizer"),
         ("grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
         ("groupvit", "CLIPTokenizerFast" if is_tokenizers_available() else None),
-        ("helium", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("helium", "TokenizersBackend" if is_tokenizers_available() else None),
         ("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
         ("hubert", "Wav2Vec2CTCTokenizer"),
         ("ibert", "RobertaTokenizer"),
-        ("idefics", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("idefics2", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("idefics3", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("idefics3", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("internvl", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
-        ("jamba", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("janus", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("jamba", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("janus", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("jetmoe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
-        ("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("kosmos-2.5", "TokenizersBackend" if is_tokenizers_available() else None),
         ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
         ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
         ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
         ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None),
         ("led", "LEDTokenizer" if is_tokenizers_available() else None),
-        ("lfm2_vl", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("lfm2_vl", "TokenizersBackend" if is_tokenizers_available() else None),
         ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("llama", "LlamaTokenizer" if is_tokenizers_available() else None),
-        ("llama4", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("llama4_text", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("llava", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("llava_next", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("llava_next_video", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("llava_onevision", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("llama4", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llama4_text", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_next", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_next_video", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_onevision", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
         ("luke", "LukeTokenizer"),
@@ -218,14 +217,14 @@
                 "MistralCommonBackend"
                 if is_mistral_common_available()
                 else ("LlamaTokenizer" if is_sentencepiece_available() else None),
-                "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
+                "LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
             ),
         ),
         (
             "mistral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("LlamaTokenizerFast" if is_tokenizers_available() else None),
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
         (
             "mistral3",
@@ -233,22 +232,22 @@
                 "MistralCommonBackend"
                 if is_mistral_common_available()
                 else ("LlamaTokenizer" if is_sentencepiece_available() else None),
-                "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None,
+                "LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
             ),
         ),
         (
             "mixtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("LlamaTokenizerFast" if is_tokenizers_available() else None),
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
-        ("mllama", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("mllama", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None),
         ("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
         ("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None),
-        ("modernbert", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("moonshine", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ("moshi", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("modernbert", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("moonshine", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("moshi", "TokenizersBackend" if is_tokenizers_available() else None),
         ("mpnet", "MPNetTokenizer" if is_tokenizers_available() else None),
         ("mpt", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("mra", "RobertaTokenizer"),
@@ -257,7 +256,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("musicgen_melody", "T5Tokenizer" if is_tokenizers_available() else None),
         ("mvp", "MvpTokenizer" if is_tokenizers_available() else None),
         ("myt5", "MyT5Tokenizer"),
-        ("nemotron", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("nemotron", "TokenizersBackend" if is_tokenizers_available() else None),
         ("nezha", "BertTokenizer" if is_tokenizers_available() else None),
         ("nllb", "NllbTokenizer" if is_tokenizers_available() else None),
         ("nllb-moe", "NllbTokenizer" if is_tokenizers_available() else None),
@@ -274,21 +273,22 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("ovis2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
         ("owlv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
         ("owlvit", "CLIPTokenizerFast" if is_tokenizers_available() else None),
-        ("paligemma", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("paddleocr_vl", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("paligemma", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("perceiver", "PerceiverTokenizer"),
-        ("persimmon", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("persimmon", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("phi", "GPT2Tokenizer" if is_tokenizers_available() else None),
-        ("phi3", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("phimoe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("phi3", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("phimoe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("phobert", "PhobertTokenizer"),
         ("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None),
         (
             "pixtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+            else ("TokenizersBackend" if is_tokenizers_available() else None),
         ),
         ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
         ("prophetnet", "ProphetNetTokenizer"),
@@ -314,14 +314,14 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("roberta", "RobertaTokenizer"),
         ("roberta-prelayernorm", "RobertaTokenizer"),
         ("roc_bert", "RoCBertTokenizer"),
-        ("roformer", "RoFormerTokenizerFast" if is_tokenizers_available() else None),
+        ("roformer", "RoFormerTokenizer" if is_tokenizers_available() else None),
         ("rwkv", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("seamless_m4t", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
         ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
         ("shieldgemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None),
         ("siglip2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
-        ("smollm3", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+        ("smollm3", "TokenizersBackend" if is_tokenizers_available() else None),
         ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None),
         ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None),
         ("splinter", "SplinterTokenizer"),
@@ -336,16 +336,16 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("tvp", "BertTokenizer" if is_tokenizers_available() else None),
         ("udop", "UdopTokenizer" if is_tokenizers_available() else None),
         ("umt5", "T5Tokenizer" if is_tokenizers_available() else None),
-        ("video_llava", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("video_llava", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("vilt", "BertTokenizer" if is_tokenizers_available() else None),
-        ("vipllava", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("vipllava", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("visual_bert", "BertTokenizer" if is_tokenizers_available() else None),
         ("vits", "VitsTokenizer"),
         (
             "voxtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None),
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
         ("wav2vec2", "Wav2Vec2CTCTokenizer"),
         ("wav2vec2-bert", "Wav2Vec2CTCTokenizer"),
@@ -361,8 +361,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("xlstm", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("xmod", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None),
         ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
-        ("zamba", "LlamaTokenizerFast" if is_tokenizers_available() else None),
-        ("zamba2", "LlamaTokenizerFast" if is_tokenizers_available() else None),
+        ("zamba", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("zamba2", "LlamaTokenizer" if is_tokenizers_available() else None),
     ]
 )

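The mapping rewrite above follows one pattern throughout: each model type now maps to a single class-name string, or None when the optional dependency is missing, instead of the old slow/fast tuple. A minimal self-contained sketch of that pattern; `has_tokenizers` is a stand-in for transformers' real `is_tokenizers_available()` check, and only the two sample entries are taken from the diff:

```python
# Sketch of the v5 mapping shape: one class-name string per model type,
# or None when the optional `tokenizers` package is absent.
import importlib.util
from collections import OrderedDict
from typing import Optional

# Stand-in for transformers' is_tokenizers_available()
has_tokenizers = importlib.util.find_spec("tokenizers") is not None

TOKENIZER_MAPPING_NAMES: OrderedDict[str, Optional[str]] = OrderedDict(
    [
        ("arcee", "LlamaTokenizer" if has_tokenizers else None),      # was "LlamaTokenizerFast"
        ("bitnet", "TokenizersBackend" if has_tokenizers else None),  # was "PreTrainedTokenizerFast"
    ]
)

# A None entry signals that no tokenizer class can be resolved for the
# model type until the missing dependency is installed.
print(TOKENIZER_MAPPING_NAMES["arcee"])
```

Entries that previously named `*TokenizerFast` classes now name either the unified class (e.g. `LlamaTokenizer`) or the generic `TokenizersBackend`.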
@@ -389,13 +389,17 @@ def load_merges(merges_file):


 def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
+    # Bloom tokenizer classes were removed but should map to the fast backend for BC
+    if class_name in {"BloomTokenizer", "BloomTokenizerFast"}:
+        return TokenizersBackend
+
     if class_name in REGISTERED_FAST_ALIASES:
         return REGISTERED_FAST_ALIASES[class_name]

     if class_name in REGISTERED_TOKENIZER_CLASSES:
         return REGISTERED_TOKENIZER_CLASSES[class_name]

-    if class_name == "PreTrainedTokenizerFast":
+    if class_name == "TokenizersBackend":
         return TokenizersBackend

     # V5: TOKENIZER_MAPPING_NAMES now maps to single strings, not tuples
@@ -404,7 +408,7 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
         module_name = model_type_to_module_name(module_name)
         if (
             module_name in ["mistral", "mistral3", "mixtral", "ministral", "ministral3", "pixtral", "voxtral"]
-            and class_name == "MistralCommonTokenizer"
+            and class_name == "MistralCommonBackend"
         ):
             module = importlib.import_module(".tokenization_mistral_common", "transformers")
         else:
@@ -428,402 +432,6 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
428
432
  return None
429
433
 
430
434
 
431
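A runnable sketch of the backward-compatibility behavior this hunk adds to `tokenizer_class_from_name`, with `TokenizersBackend` stubbed out since the real class lives in transformers; the registry and module lookups of the real function are elided:

```python
from typing import Optional

class TokenizersBackend:
    """Stub standing in for transformers' tokenizers-backed base class."""

def tokenizer_class_from_name(class_name: str) -> Optional[type]:
    # Removed Bloom tokenizer classes keep resolving, for backward compatibility.
    if class_name in {"BloomTokenizer", "BloomTokenizerFast"}:
        return TokenizersBackend
    # v5 name for what rc0 still called "PreTrainedTokenizerFast".
    if class_name == "TokenizersBackend":
        return TokenizersBackend
    return None  # registry and module lookups elided in this sketch

assert tokenizer_class_from_name("BloomTokenizerFast") is TokenizersBackend
```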
- def _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
-     # Delegate to shared helper to avoid duplication
-     return find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-
-
- def _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs):
-     """
-     Load a tokenizer using only the tokenizers backend (no SentencePiece fallback).
-
-     This function attempts to load with the following priority:
-     1. If tokenizer.json exists, load directly
-     2. If any .model file (SPM) exists, try extracting vocab and merges
-     3. If vocab.json and merges.txt exist, load with those
-     4. If vocab.txt exists (WordPiece models), load with that
-
-     Args:
-         tokenizer_class: The tokenizer class to instantiate
-         pretrained_model_name_or_path: Path or model id
-         inputs: Additional positional arguments for tokenizer init
-         kwargs: Additional keyword arguments
-
-     Returns:
-         An instantiated tokenizer object
-
-     Raises:
-         ValueError: If tokenizer could not be loaded with tokenizers backend
-     """
-     files_loaded = []
-
-     # Try tokenizer.json first
-     try:
-         tokenizer_json_exists = has_file(
-             pretrained_model_name_or_path,
-             "tokenizer.json",
-             revision=kwargs.get("revision"),
-             token=kwargs.get("token"),
-             cache_dir=kwargs.get("cache_dir"),
-             local_files_only=kwargs.get("local_files_only", False),
-         )
-     except Exception:
-         tokenizer_json_exists = False
-
-     if tokenizer_json_exists:
-         files_loaded.append("tokenizer.json")
-         kwargs["backend"] = "tokenizers"
-         kwargs["files_loaded"] = files_loaded
-         # Some old models have uploaded a tokenizer.json but haven't updated tokenizer_config.json to point to the correct tokenizer class
-         tokenizer_class = (
-             TokenizersBackend
-             if tokenizer_class.__name__ in ("PythonBackend", "PreTrainedTokenizer")
-             else tokenizer_class
-         )
-         return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-     # Try tekken.json (Mistral format)
-     try:
-         if has_file(
-             pretrained_model_name_or_path,
-             "tekken.json",
-             revision=kwargs.get("revision"),
-             token=kwargs.get("token"),
-             cache_dir=kwargs.get("cache_dir"),
-             local_files_only=kwargs.get("local_files_only", False),
-         ):
-             from ...integrations.mistral import convert_tekken_tokenizer
-
-             tekken_file = cached_file(
-                 pretrained_model_name_or_path,
-                 "tekken.json",
-                 **{
-                     k: v
-                     for k, v in kwargs.items()
-                     if k
-                     in ["cache_dir", "force_download", "proxies", "token", "revision", "local_files_only", "subfolder"]
-                 },
-             )
-             if tekken_file is not None:
-                 files_loaded.append("tekken.json")
-                 kwargs["backend"] = "tokenizers"
-                 kwargs["files_loaded"] = files_loaded
-                 return convert_tekken_tokenizer(tekken_file)
-     except (ImportError, Exception):
-         pass
-
-     # Try extracting from SentencePiece model
-     spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-     if spm_file is not None:
-         try:
-             resolved_spm = cached_file(
-                 pretrained_model_name_or_path,
-                 spm_file,
-                 cache_dir=kwargs.get("cache_dir"),
-                 force_download=kwargs.get("force_download", False),
-                 proxies=kwargs.get("proxies"),
-                 token=kwargs.get("token"),
-                 revision=kwargs.get("revision"),
-                 local_files_only=kwargs.get("local_files_only", False),
-                 subfolder=kwargs.get("subfolder", ""),
-             )
-         except Exception:
-             resolved_spm = None
-
-         if resolved_spm is not None:
-             try:
-                 from ...tokenization_utils_sentencepiece import SentencePieceExtractor
-
-                 fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class))
-                 if "vocab" in fast_sig.parameters:
-                     try:
-                         vocab_ids, vocab_scores, merges = SentencePieceExtractor(resolved_spm).extract()
-                         files_loaded.append(spm_file)
-                         kwargs["backend"] = "tokenizers"
-                         kwargs["files_loaded"] = files_loaded
-                         # If tokenizer needs both vocab and merges (BPE models)
-                         if "merges" in fast_sig.parameters:
-                             return tokenizer_class.from_pretrained(
-                                 pretrained_model_name_or_path, *inputs, vocab=vocab_scores, merges=merges, **kwargs
-                             )
-                         # If tokenizer only needs vocab (Unigram models like NLLB, SeamlessM4T)
-                         else:
-                             return tokenizer_class.from_pretrained(
-                                 pretrained_model_name_or_path, *inputs, vocab=vocab_scores, **kwargs
-                             )
-                     except Exception:
-                         pass
-             except ImportError as e:
-                 if "sentencepiece" in str(e).lower() or "SentencePiece" in str(e):
-                     raise ImportError(
-                         f"This checkpoint only contains a SentencePiece model file ({spm_file}), but the `sentencepiece` library is not installed. "
-                         f"Please install sentencepiece to load this tokenizer: `pip install sentencepiece`"
-                     ) from e
-                 raise
-             except Exception:
-                 pass
-
-     vocab, merges, loaded = load_vocab_and_merges(pretrained_model_name_or_path, **kwargs)
-     if vocab is not None:
-         files_loaded.extend(loaded)
-         if issubclass(tokenizer_class, PreTrainedTokenizer):
-             kwargs["backend"] = "python"
-         else:
-             kwargs["backend"] = "tokenizers"
-         kwargs["files_loaded"] = files_loaded
-         if merges is not None:
-             return tokenizer_class.from_pretrained(
-                 pretrained_model_name_or_path, *inputs, vocab=vocab, merges=merges, **kwargs
-             )
-         else:
-             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs)
-
-     # Try vocab.txt (WordPiece models like SplinterTokenizer)
-     try:
-         resolved_vocab_txt = cached_file(
-             pretrained_model_name_or_path,
-             "vocab.txt",
-             cache_dir=kwargs.get("cache_dir"),
-             force_download=kwargs.get("force_download", False),
-             proxies=kwargs.get("proxies"),
-             token=kwargs.get("token"),
-             revision=kwargs.get("revision"),
-             local_files_only=kwargs.get("local_files_only", False),
-             subfolder=kwargs.get("subfolder", ""),
-         )
-     except Exception:
-         resolved_vocab_txt = None
-
-     if resolved_vocab_txt is not None:
-         try:
-             fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class))
-             if "vocab" in fast_sig.parameters:
-                 # Load vocab.txt: each line is a token, line number is the ID
-                 vocab = OrderedDict()
-                 with open(resolved_vocab_txt, "r", encoding="utf-8") as reader:
-                     tokens = reader.readlines()
-                 for index, token in enumerate(tokens):
-                     token = token.rstrip("\n")
-                     vocab[token] = index
-                 files_loaded.append("vocab.txt")
-                 kwargs["backend"] = "tokenizers"
-                 kwargs["files_loaded"] = files_loaded
-                 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs)
-         except Exception:
-             pass
-
-     # If all methods failed, raise an error
-     raise ValueError(
-         f"Could not load tokenizer from {pretrained_model_name_or_path} using tokenizers backend. "
-         "No tokenizer.json, tekken.json, vocab.json+merges.txt, vocab.txt, or compatible SentencePiece model found."
-     )
-
-
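The next deleted helper, `_try_load_tokenizer_with_fallbacks`, validated an optional `backend` kwarg before routing between the two backends. A simplified, dependency-free sketch of that validation, with `print` standing in for the library's `logger.warning`:

```python
VALID_BACKENDS = {"tokenizers", "sentencepiece"}

def validate_backend(backend: str = "tokenizers") -> str:
    # Unknown values fall back to the default, mirroring the removed check.
    if backend not in VALID_BACKENDS:
        print(f"Invalid backend '{backend}' specified; defaulting to 'tokenizers'.")
        return "tokenizers"
    return backend

assert validate_backend("sentencepiece") == "sentencepiece"
assert validate_backend("spm") == "tokenizers"
```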
- def _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs):
-     """
-     Try to load a tokenizer with backend selection.
-
-     This function routes to the appropriate backend based on the 'backend' parameter:
-     - "tokenizers" (default): Uses HuggingFace tokenizers library backend
-     - "sentencepiece": Uses SentencePiece backend
-
-     For the tokenizers backend, attempts to load with the following priority:
-     1. If tokenizer.json exists, load directly
-     2. If any .model file (SPM) exists, try extracting vocab and merges
-     3. If vocab.json and merges.txt exist, load with those
-     4. Fallback to SentencePieceBackend if available
-
-     Args:
-         tokenizer_class: The tokenizer class to instantiate (can be None)
-         pretrained_model_name_or_path: Path or model id
-         inputs: Additional positional arguments for tokenizer init
-         kwargs: Additional keyword arguments (may include 'backend' parameter, defaults to "tokenizers")
-
-     Returns:
-         An instantiated tokenizer object
-
-     Raises:
-         ValueError: If no tokenizer could be loaded
-     """
-     # Extract the backend parameter - default to "tokenizers" to prioritize tokenizers backend
-     backend = kwargs.pop("backend", "tokenizers")
-
-     # Validate backend parameter
-     if backend not in ["sentencepiece", "tokenizers"]:
-         logger.warning(
-             f"Invalid backend '{backend}' specified. Valid options are 'tokenizers' or 'sentencepiece'. "
-             "Defaulting to 'tokenizers' backend."
-         )
-         backend = "tokenizers"
-
-     # Route to SentencePiece backend if requested
-     if backend == "sentencepiece":
-         if SentencePieceBackend is None:
-             raise ValueError(
-                 "SentencePiece backend was requested but sentencepiece is not installed. "
-                 "Please install it with: pip install sentencepiece"
-             )
-         logger.info("Loading tokenizer with SentencePiece backend")
-         # Track files loaded for SentencePiece backend
-         spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-         files_loaded = [spm_file] if spm_file else []
-         kwargs["backend"] = "sentencepiece"
-         kwargs["files_loaded"] = files_loaded
-         # Resolve the SPM file path and pass it as vocab_file
-         if spm_file is not None:
-             resolved_vocab_file = cached_file(
-                 pretrained_model_name_or_path,
-                 spm_file,
-                 cache_dir=kwargs.get("cache_dir"),
-                 force_download=kwargs.get("force_download", False),
-                 proxies=kwargs.get("proxies"),
-                 token=kwargs.get("token"),
-                 revision=kwargs.get("revision"),
-                 local_files_only=kwargs.get("local_files_only", False),
-                 subfolder=kwargs.get("subfolder", ""),
-             )
-             kwargs["vocab_file"] = resolved_vocab_file
-         if isinstance(tokenizer_class, type) and issubclass(tokenizer_class, SentencePieceBackend):
-             logger.info("Loading tokenizer with SentencePiece backend using tokenizer class")
-             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-         return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-     # Route to tokenizers backend (default)
-     if backend == "tokenizers":
-         if tokenizer_class is not None:
-             # Check if tokenizer_class inherits from PreTrainedTokenizer (but not from TokenizersBackend/SentencePieceBackend)
-             # These are edge cases with custom logic (e.g., BioGptTokenizer with Moses tokenization)
-             from ...tokenization_python import PreTrainedTokenizer
-
-             # Build list of backend classes to check against
-             backend_classes = [TokenizersBackend] if TokenizersBackend else []
-             if SentencePieceBackend:
-                 backend_classes.append(SentencePieceBackend)
-
-             # Check if it's a custom PreTrainedTokenizer (not a backend class)
-             is_custom_pre_trained = (
-                 isinstance(tokenizer_class, type)
-                 and issubclass(tokenizer_class, PreTrainedTokenizer)
-                 and not any(issubclass(tokenizer_class, bc) for bc in backend_classes)
-                 and tokenizer_class.__name__ not in ("PythonBackend", "PreTrainedTokenizer")
-             )
-
-             # Check if it's a completely custom tokenizer (not PreTrainedTokenizer, not backend class)
-             # e.g., MistralCommonBackend which has its own from_pretrained logic
-             inherits_from_backend = isinstance(tokenizer_class, type) and any(
-                 bc and issubclass(tokenizer_class, bc) for bc in backend_classes
-             )
-             is_completely_custom = (
-                 isinstance(tokenizer_class, type)
-                 and not issubclass(tokenizer_class, PythonBackend)
-                 and not inherits_from_backend
-             )
-
-             if is_custom_pre_trained:
-                 logger.info("Loading tokenizer with custom PreTrainedTokenizer backend (edge case)")
-                 # Track the backend type for custom tokenizers
-                 kwargs["backend"] = "custom"
-                 kwargs["files_loaded"] = []  # Custom tokenizers may load various files
-                 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-             if is_completely_custom:
-                 # For completely custom tokenizers (like MistralCommonBackend), try calling from_pretrained directly
-                 logger.info("Loading tokenizer with custom tokenizer class (non-PreTrainedTokenizer)")
-                 # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
-                 custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
-                 custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
-                 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)
-
-         if TokenizersBackend is None:
-             raise ValueError(
-                 "Tokenizers backend is the default but tokenizers library is not installed. "
-                 "Please install it with: pip install tokenizers"
-             )
-         logger.info("Loading tokenizer with tokenizers backend")
-         try:
-             return _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs)
-         except ValueError as e:
-             # If tokenizers backend fails, try falling back to SentencePiece backend if available
-             spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-             if spm_file is not None and SentencePieceBackend is not None:
-                 logger.info(
-                     f"Tokenizers backend failed: {e}. "
-                     f"Falling back to SentencePieceBackend since {spm_file} file was found."
-                 )
-                 files_loaded = [spm_file]
-                 kwargs["backend"] = "sentencepiece"
-                 kwargs["files_loaded"] = files_loaded
-                 # Resolve the SPM file path and pass it as vocab_file
-                 resolved_vocab_file = cached_file(
-                     pretrained_model_name_or_path,
-                     spm_file,
-                     cache_dir=kwargs.get("cache_dir"),
-                     force_download=kwargs.get("force_download", False),
-                     proxies=kwargs.get("proxies"),
-                     token=kwargs.get("token"),
-                     revision=kwargs.get("revision"),
-                     local_files_only=kwargs.get("local_files_only", False),
-                     subfolder=kwargs.get("subfolder", ""),
-                 )
-                 kwargs["vocab_file"] = resolved_vocab_file
-                 if tokenizer_class is not None and issubclass(tokenizer_class, SentencePieceBackend):
-                     logger.info(
-                         "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
-                     )
-                     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-                 return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-             # If no fallback available, try calling tokenizer class directly as last resort
-             if hasattr(tokenizer_class, "from_pretrained"):
-                 logger.info(
-                     f"Tokenizers backend failed: {e}. Trying to load tokenizer directly from tokenizer class."
-                 )
-                 # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
-                 custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
-                 custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
-                 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)
-             # Re-raise if no fallback options available
-             raise
-
-     # If no tokenizer class but tokenizers backend requested, fall back to SentencePiece if available
-     spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-     if spm_file is not None and SentencePieceBackend is not None:
-         logger.info(
-             f"Tokenizers backend was requested but no tokenizer class found. "
-             f"Falling back to SentencePieceBackend since {spm_file} file was found."
-         )
-         files_loaded = [spm_file]
-         kwargs["backend"] = "sentencepiece"
-         kwargs["files_loaded"] = files_loaded
-         # Resolve the SPM file path and pass it as vocab_file
-         resolved_vocab_file = cached_file(
-             pretrained_model_name_or_path,
-             spm_file,
-             cache_dir=kwargs.get("cache_dir"),
-             force_download=kwargs.get("force_download", False),
-             proxies=kwargs.get("proxies"),
-             token=kwargs.get("token"),
-             revision=kwargs.get("revision"),
-             local_files_only=kwargs.get("local_files_only", False),
-             subfolder=kwargs.get("subfolder", ""),
-         )
-         kwargs["vocab_file"] = resolved_vocab_file
-         if (
-             tokenizer_class is not None
-             and SentencePieceBackend is not None
-             and issubclass(tokenizer_class, SentencePieceBackend)
-         ):
-             logger.info(
-                 "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
-             )
-             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-         return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-     raise ValueError(
-         f"Could not load tokenizer from {pretrained_model_name_or_path}. "
-         "No tokenizer class could be determined and no SentencePiece model found."
-     )
-
-
  def get_tokenizer_config(
      pretrained_model_name_or_path: Union[str, os.PathLike[str]],
      cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
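The helpers deleted above implemented a multi-step file-resolution fallback; the hunks below replace their call sites with a direct `tokenizer_class.from_pretrained(...)`. For comparison, a compact, self-contained sketch of the priority order the removed code implemented (file names taken from its docstrings; `legacy_load_priority` is illustrative, not a transformers API):

```python
def legacy_load_priority(repo_files: set) -> str:
    # Priority order from the removed _load_tokenizers_backend docstring.
    if "tokenizer.json" in repo_files:
        return "load tokenizer.json directly"
    if "tekken.json" in repo_files:
        return "convert the Mistral tekken.json format"
    if any(name.endswith(".model") for name in repo_files):
        return "extract vocab/merges from the SentencePiece model"
    if {"vocab.json", "merges.txt"} <= repo_files:
        return "load vocab.json + merges.txt"
    if "vocab.txt" in repo_files:
        return "load the WordPiece vocab.txt"
    raise ValueError("no loadable tokenizer files found")

assert legacy_load_priority({"tokenizer.json", "vocab.txt"}) == "load tokenizer.json directly"
```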
@@ -1084,7 +692,7 @@ class AutoTokenizer:

          if (
              config_tokenizer_class is not None
-             and config_tokenizer_class != "PreTrainedTokenizerFast"
+             and config_tokenizer_class != "TokenizersBackend"
              and "Fast" in config_tokenizer_class
          ):
              config_tokenizer_class = config_tokenizer_class[:-4]
@@ -1125,10 +733,12 @@
              tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
              if tokenizer_class is None and not tokenizer_class_candidate.endswith("Fast"):
                  tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate + "Fast")
+             if tokenizer_class.__name__ == "PythonBackend":  # unless you inherit from it?
+                 tokenizer_class = TokenizersBackend
          else:
              tokenizer_class = fast_tokenizer_class

-         return _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs)
+         return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

          # Otherwise we have to be creative.
          # if model is an encoder decoder, the encoder tokenizer class is used by default
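The first hunk above keeps normalizing a configured `...Fast` class name by dropping the suffix before lookup, now skipping the renamed `TokenizersBackend` instead of `PreTrainedTokenizerFast`. A stand-alone sketch of that normalization (hypothetical helper name, mirroring the diff's `[:-4]` slice):

```python
from typing import Optional

def normalize_config_tokenizer_class(name: Optional[str]) -> Optional[str]:
    # Mirrors the diff: strip the trailing "Fast" (last four characters) when present.
    if name is not None and name != "TokenizersBackend" and "Fast" in name:
        name = name[:-4]
    return name

assert normalize_config_tokenizer_class("LlamaTokenizerFast") == "LlamaTokenizer"
assert normalize_config_tokenizer_class("TokenizersBackend") == "TokenizersBackend"
assert normalize_config_tokenizer_class(None) is None
```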
@@ -1144,17 +754,9 @@

          model_type = config_class_to_model_type(type(config).__name__)
          if model_type is not None:
-             tokenizer_class = TOKENIZER_MAPPING[type(config)]
-
+             tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend)
              if tokenizer_class is not None:
-                 return _try_load_tokenizer_with_fallbacks(
-                     tokenizer_class, pretrained_model_name_or_path, inputs, kwargs
-                 )
-             else:
-                 raise ValueError(
-                     "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
-                     "in order to use this tokenizer."
-                 )
+                 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

          raise ValueError(
              f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"