transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/auto/tokenization_auto.py

@@ -15,7 +15,6 @@
 """Auto Tokenizer class."""
 
 import importlib
-import inspect
 import json
 import os
 from collections import OrderedDict
@@ -26,8 +25,7 @@ from transformers.utils.import_utils import is_mistral_common_available
 from ...configuration_utils import PreTrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
-from ...
-from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE, find_sentencepiece_model_file, load_vocab_and_merges
+from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...utils import (
     extract_commit_hash,
     is_g2p_en_available,
@@ -35,7 +33,7 @@ from ...utils import (
     is_tokenizers_available,
     logging,
 )
-from ...utils.hub import cached_file
+from ...utils.hub import cached_file
 from ..encoder_decoder import EncoderDecoderConfig
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
@@ -68,8 +66,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("aimv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
         ("albert", "AlbertTokenizer" if is_tokenizers_available() else None),
         ("align", "BertTokenizer" if is_tokenizers_available() else None),
-        ("arcee", "
-        ("aria", "
+        ("arcee", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("aria", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
         ("bark", "BertTokenizer" if is_tokenizers_available() else None),
         ("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -82,19 +80,19 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("big_bird", "BigBirdTokenizer" if is_tokenizers_available() else None),
         ("bigbird_pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("biogpt", "BioGptTokenizer"),
-        ("bitnet", "
+        ("bitnet", "TokenizersBackend" if is_tokenizers_available() else None),
         ("blenderbot", "BlenderbotTokenizer" if is_tokenizers_available() else None),
         ("blenderbot-small", "BlenderbotSmallTokenizer"),
         ("blip", "BertTokenizer" if is_tokenizers_available() else None),
         ("blip-2", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("bloom", "TokenizersBackend" if is_tokenizers_available() else None),
-        ("blt", "
+        ("blt", "TokenizersBackend" if is_tokenizers_available() else None),
         ("bridgetower", "RobertaTokenizer"),
         ("bros", "BertTokenizer" if is_tokenizers_available() else None),
         ("byt5", "ByT5Tokenizer"),
         ("camembert", "CamembertTokenizer" if is_tokenizers_available() else None),
         ("canine", "CanineTokenizer"),
-        ("chameleon", "
+        ("chameleon", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None),
         ("clap", "RobertaTokenizer"),
         ("clip", "CLIPTokenizer" if is_tokenizers_available() else None),
@@ -104,34 +102,34 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("cohere", "CohereTokenizer" if is_tokenizers_available() else None),
         ("cohere2", "CohereTokenizer" if is_tokenizers_available() else None),
-        ("colpali", "
+        ("colpali", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("colqwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
         ("convbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("cpm", "CpmTokenizer" if is_tokenizers_available() else None),
         ("cpmant", "CpmAntTokenizer"),
-        ("csm", "
+        ("csm", "TokenizersBackend" if is_tokenizers_available() else None),
         ("ctrl", "CTRLTokenizer"),
         ("data2vec-audio", "Wav2Vec2CTCTokenizer"),
         ("data2vec-text", "RobertaTokenizer"),
         ("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("deberta", "DebertaTokenizer" if is_tokenizers_available() else None),
         ("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None),
-        ("deepseek_v2", "
-        ("deepseek_v3", "
-        ("deepseek_vl", "
-        ("deepseek_vl_hybrid", "
+        ("deepseek_v2", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_v3", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_vl", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("deepseek_vl_hybrid", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("dia", "DiaTokenizer"),
-        ("diffllama", "
+        ("diffllama", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("distilbert", "BertTokenizer" if is_tokenizers_available() else None),
         ("dpr", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None),
         ("electra", "BertTokenizer" if is_tokenizers_available() else None),
         ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
-        ("ernie4_5", "
-        ("ernie4_5_moe", "
+        ("ernie4_5", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("ernie4_5_moe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("esm", "EsmTokenizer"),
         ("exaone4", "GPT2Tokenizer" if is_tokenizers_available() else None),
-        ("falcon", "
+        ("falcon", "TokenizersBackend" if is_tokenizers_available() else None),
         ("falcon_mamba", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
         ("flaubert", "FlaubertTokenizer"),
@@ -141,6 +139,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("fnet", "FNetTokenizerFast" if is_tokenizers_available() else None),
         ("fsmt", "FSMTTokenizer"),
         ("funnel", "FunnelTokenizer" if is_tokenizers_available() else None),
+        ("fuyu", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gemma", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma3", "GemmaTokenizerFast" if is_tokenizers_available() else None),
@@ -148,19 +147,19 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("gemma3n", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("gemma3n_text", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("git", "BertTokenizer" if is_tokenizers_available() else None),
-        ("glm", "
-        ("glm4", "
-        ("glm4_moe", "
-        ("glm4v", "
-        ("glm4v_moe", "
-        ("got_ocr2", "
+        ("glm", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4_moe", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4v", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("glm4v_moe", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("got_ocr2", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None),
         ("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_neo", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("gpt_neox", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
         ("gpt_neox_japanese", "GPTNeoXJapaneseTokenizer"),
-        ("gpt_oss", "
+        ("gpt_oss", "TokenizersBackend" if is_tokenizers_available() else None),
         ("gptj", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("granite", "GPT2Tokenizer"),
         ("granitemoe", "GPT2Tokenizer"),
@@ -168,35 +167,35 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("granitemoeshared", "GPT2Tokenizer"),
         ("grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
         ("groupvit", "CLIPTokenizerFast" if is_tokenizers_available() else None),
-        ("helium", "
+        ("helium", "TokenizersBackend" if is_tokenizers_available() else None),
         ("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
         ("hubert", "Wav2Vec2CTCTokenizer"),
         ("ibert", "RobertaTokenizer"),
-        ("idefics", "
-        ("idefics2", "
-        ("idefics3", "
+        ("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("idefics3", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None),
         ("internvl", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
-        ("jamba", "
-        ("janus", "
-        ("jetmoe", "
+        ("jamba", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("janus", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("jetmoe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
-        ("kosmos-2.5", "
+        ("kosmos-2.5", "TokenizersBackend" if is_tokenizers_available() else None),
         ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
         ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
         ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
         ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None),
         ("led", "LEDTokenizer" if is_tokenizers_available() else None),
-        ("lfm2_vl", "
+        ("lfm2_vl", "TokenizersBackend" if is_tokenizers_available() else None),
         ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("llama", "LlamaTokenizer" if is_tokenizers_available() else None),
-        ("llama4", "
-        ("llama4_text", "
-        ("llava", "
-        ("llava_next", "
-        ("llava_next_video", "
-        ("llava_onevision", "
+        ("llama4", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llama4_text", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_next", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_next_video", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("llava_onevision", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
         ("luke", "LukeTokenizer"),
@@ -218,14 +217,14 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
                 "MistralCommonBackend"
                 if is_mistral_common_available()
                 else ("LlamaTokenizer" if is_sentencepiece_available() else None),
-                "
+                "LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
             ),
         ),
         (
             "mistral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
         (
             "mistral3",
@@ -233,22 +232,22 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
                 "MistralCommonBackend"
                 if is_mistral_common_available()
                 else ("LlamaTokenizer" if is_sentencepiece_available() else None),
-                "
+                "LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
             ),
         ),
         (
             "mixtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
-        ("mllama", "
+        ("mllama", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None),
         ("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
         ("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None),
-        ("modernbert", "
-        ("moonshine", "
-        ("moshi", "
+        ("modernbert", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("moonshine", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("moshi", "TokenizersBackend" if is_tokenizers_available() else None),
         ("mpnet", "MPNetTokenizer" if is_tokenizers_available() else None),
         ("mpt", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("mra", "RobertaTokenizer"),
@@ -257,7 +256,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("musicgen_melody", "T5Tokenizer" if is_tokenizers_available() else None),
         ("mvp", "MvpTokenizer" if is_tokenizers_available() else None),
         ("myt5", "MyT5Tokenizer"),
-        ("nemotron", "
+        ("nemotron", "TokenizersBackend" if is_tokenizers_available() else None),
         ("nezha", "BertTokenizer" if is_tokenizers_available() else None),
         ("nllb", "NllbTokenizer" if is_tokenizers_available() else None),
         ("nllb-moe", "NllbTokenizer" if is_tokenizers_available() else None),
@@ -274,21 +273,22 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("ovis2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
         ("owlv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
         ("owlvit", "CLIPTokenizerFast" if is_tokenizers_available() else None),
-        ("
+        ("paddleocr_vl", "TokenizersBackend" if is_tokenizers_available() else None),
+        ("paligemma", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None),
         ("perceiver", "PerceiverTokenizer"),
-        ("persimmon", "
+        ("persimmon", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("phi", "GPT2Tokenizer" if is_tokenizers_available() else None),
-        ("phi3", "
-        ("phimoe", "
+        ("phi3", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("phimoe", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("phobert", "PhobertTokenizer"),
         ("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None),
         (
             "pixtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("
+            else ("TokenizersBackend" if is_tokenizers_available() else None),
         ),
         ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
         ("prophetnet", "ProphetNetTokenizer"),
@@ -314,14 +314,14 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("roberta", "RobertaTokenizer"),
         ("roberta-prelayernorm", "RobertaTokenizer"),
         ("roc_bert", "RoCBertTokenizer"),
-        ("roformer", "
+        ("roformer", "RoFormerTokenizer" if is_tokenizers_available() else None),
         ("rwkv", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("seamless_m4t", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
         ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
         ("shieldgemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
         ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None),
         ("siglip2", "GemmaTokenizerFast" if is_tokenizers_available() else None),
-        ("smollm3", "
+        ("smollm3", "TokenizersBackend" if is_tokenizers_available() else None),
         ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None),
         ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None),
         ("splinter", "SplinterTokenizer"),
@@ -336,16 +336,16 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("tvp", "BertTokenizer" if is_tokenizers_available() else None),
         ("udop", "UdopTokenizer" if is_tokenizers_available() else None),
         ("umt5", "T5Tokenizer" if is_tokenizers_available() else None),
-        ("video_llava", "
+        ("video_llava", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("vilt", "BertTokenizer" if is_tokenizers_available() else None),
-        ("vipllava", "
+        ("vipllava", "LlamaTokenizer" if is_tokenizers_available() else None),
         ("visual_bert", "BertTokenizer" if is_tokenizers_available() else None),
         ("vits", "VitsTokenizer"),
         (
             "voxtral",
             "MistralCommonBackend"
             if is_mistral_common_available()
-            else ("
+            else ("LlamaTokenizer" if is_tokenizers_available() else None),
         ),
         ("wav2vec2", "Wav2Vec2CTCTokenizer"),
         ("wav2vec2-bert", "Wav2Vec2CTCTokenizer"),
@@ -361,8 +361,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]](
         ("xlstm", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
         ("xmod", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None),
         ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
-        ("zamba", "
-        ("zamba2", "
+        ("zamba", "LlamaTokenizer" if is_tokenizers_available() else None),
+        ("zamba2", "LlamaTokenizer" if is_tokenizers_available() else None),
     ]
 )
 
@@ -389,13 +389,17 @@ def load_merges(merges_file):
 
 
 def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
+    # Bloom tokenizer classes were removed but should map to the fast backend for BC
+    if class_name in {"BloomTokenizer", "BloomTokenizerFast"}:
+        return TokenizersBackend
+
     if class_name in REGISTERED_FAST_ALIASES:
         return REGISTERED_FAST_ALIASES[class_name]
 
     if class_name in REGISTERED_TOKENIZER_CLASSES:
         return REGISTERED_TOKENIZER_CLASSES[class_name]
 
-    if class_name == "
+    if class_name == "TokenizersBackend":
         return TokenizersBackend
 
     # V5: TOKENIZER_MAPPING_NAMES now maps to single strings, not tuples
@@ -404,7 +408,7 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
         module_name = model_type_to_module_name(module_name)
         if (
             module_name in ["mistral", "mistral3", "mixtral", "ministral", "ministral3", "pixtral", "voxtral"]
-            and class_name == "
+            and class_name == "MistralCommonBackend"
         ):
             module = importlib.import_module(".tokenization_mistral_common", "transformers")
         else:
@@ -428,402 +432,6 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
     return None
 
 
-def _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
-    # Delegate to shared helper to avoid duplication
-    return find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-
-
-def _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs):
-    """
-    Load a tokenizer using only the tokenizers backend (no SentencePiece fallback).
-
-    This function attempts to load with the following priority:
-    1. If tokenizer.json exists, load directly
-    2. If any .model file (SPM) exists, try extracting vocab and merges
-    3. If vocab.json and merges.txt exist, load with those
-    4. If vocab.txt exists (WordPiece models), load with that
-
-    Args:
-        tokenizer_class: The tokenizer class to instantiate
-        pretrained_model_name_or_path: Path or model id
-        inputs: Additional positional arguments for tokenizer init
-        kwargs: Additional keyword arguments
-
-    Returns:
-        An instantiated tokenizer object
-
-    Raises:
-        ValueError: If tokenizer could not be loaded with tokenizers backend
-    """
-    files_loaded = []
-
-    # Try tokenizer.json first
-    try:
-        tokenizer_json_exists = has_file(
-            pretrained_model_name_or_path,
-            "tokenizer.json",
-            revision=kwargs.get("revision"),
-            token=kwargs.get("token"),
-            cache_dir=kwargs.get("cache_dir"),
-            local_files_only=kwargs.get("local_files_only", False),
-        )
-    except Exception:
-        tokenizer_json_exists = False
-
-    if tokenizer_json_exists:
-        files_loaded.append("tokenizer.json")
-        kwargs["backend"] = "tokenizers"
-        kwargs["files_loaded"] = files_loaded
-        # Some old models have uploaded a tokenizer.json but haven't updated tokenizer_config.json to point to the correct tokenizer class
-        tokenizer_class = (
-            TokenizersBackend
-            if tokenizer_class.__name__ in ("PythonBackend", "PreTrainedTokenizer")
-            else tokenizer_class
-        )
-        return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-    # Try tekken.json (Mistral format)
-    try:
-        if has_file(
-            pretrained_model_name_or_path,
-            "tekken.json",
-            revision=kwargs.get("revision"),
-            token=kwargs.get("token"),
-            cache_dir=kwargs.get("cache_dir"),
-            local_files_only=kwargs.get("local_files_only", False),
-        ):
-            from ...integrations.mistral import convert_tekken_tokenizer
-
-            tekken_file = cached_file(
-                pretrained_model_name_or_path,
-                "tekken.json",
-                **{
-                    k: v
-                    for k, v in kwargs.items()
-                    if k
-                    in ["cache_dir", "force_download", "proxies", "token", "revision", "local_files_only", "subfolder"]
-                },
-            )
-            if tekken_file is not None:
-                files_loaded.append("tekken.json")
-                kwargs["backend"] = "tokenizers"
-                kwargs["files_loaded"] = files_loaded
-                return convert_tekken_tokenizer(tekken_file)
-    except (ImportError, Exception):
-        pass
-
-    # Try extracting from SentencePiece model
-    spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-    if spm_file is not None:
-        try:
-            resolved_spm = cached_file(
-                pretrained_model_name_or_path,
-                spm_file,
-                cache_dir=kwargs.get("cache_dir"),
-                force_download=kwargs.get("force_download", False),
-                proxies=kwargs.get("proxies"),
-                token=kwargs.get("token"),
-                revision=kwargs.get("revision"),
-                local_files_only=kwargs.get("local_files_only", False),
-                subfolder=kwargs.get("subfolder", ""),
-            )
-        except Exception:
-            resolved_spm = None
-
-        if resolved_spm is not None:
-            try:
-                from ...tokenization_utils_sentencepiece import SentencePieceExtractor
-
-                fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class))
-                if "vocab" in fast_sig.parameters:
-                    try:
-                        vocab_ids, vocab_scores, merges = SentencePieceExtractor(resolved_spm).extract()
-                        files_loaded.append(spm_file)
-                        kwargs["backend"] = "tokenizers"
-                        kwargs["files_loaded"] = files_loaded
-                        # If tokenizer needs both vocab and merges (BPE models)
-                        if "merges" in fast_sig.parameters:
-                            return tokenizer_class.from_pretrained(
-                                pretrained_model_name_or_path, *inputs, vocab=vocab_scores, merges=merges, **kwargs
-                            )
-                        # If tokenizer only needs vocab (Unigram models like NLLB, SeamlessM4T)
-                        else:
-                            return tokenizer_class.from_pretrained(
-                                pretrained_model_name_or_path, *inputs, vocab=vocab_scores, **kwargs
-                            )
-                    except Exception:
-                        pass
-            except ImportError as e:
-                if "sentencepiece" in str(e).lower() or "SentencePiece" in str(e):
-                    raise ImportError(
-                        f"This checkpoint only contains a SentencePiece model file ({spm_file}), but the `sentencepiece` library is not installed. "
-                        f"Please install sentencepiece to load this tokenizer: `pip install sentencepiece`"
-                    ) from e
-                raise
-            except Exception:
-                pass
-
-    vocab, merges, loaded = load_vocab_and_merges(pretrained_model_name_or_path, **kwargs)
-    if vocab is not None:
-        files_loaded.extend(loaded)
-        if issubclass(tokenizer_class, PreTrainedTokenizer):
-            kwargs["backend"] = "python"
-        else:
-            kwargs["backend"] = "tokenizers"
-        kwargs["files_loaded"] = files_loaded
-        if merges is not None:
-            return tokenizer_class.from_pretrained(
-                pretrained_model_name_or_path, *inputs, vocab=vocab, merges=merges, **kwargs
-            )
-        else:
-            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs)
-
-    # Try vocab.txt (WordPiece models like SplinterTokenizer)
-    try:
-        resolved_vocab_txt = cached_file(
-            pretrained_model_name_or_path,
-            "vocab.txt",
-            cache_dir=kwargs.get("cache_dir"),
-            force_download=kwargs.get("force_download", False),
-            proxies=kwargs.get("proxies"),
-            token=kwargs.get("token"),
-            revision=kwargs.get("revision"),
-            local_files_only=kwargs.get("local_files_only", False),
-            subfolder=kwargs.get("subfolder", ""),
-        )
-    except Exception:
-        resolved_vocab_txt = None
-
-    if resolved_vocab_txt is not None:
-        try:
-            fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class))
-            if "vocab" in fast_sig.parameters:
-                # Load vocab.txt: each line is a token, line number is the ID
-                vocab = OrderedDict()
-                with open(resolved_vocab_txt, "r", encoding="utf-8") as reader:
|
|
604
|
-
tokens = reader.readlines()
|
|
605
|
-
for index, token in enumerate(tokens):
|
|
606
|
-
token = token.rstrip("\n")
|
|
607
|
-
vocab[token] = index
|
|
608
|
-
files_loaded.append("vocab.txt")
|
|
609
|
-
kwargs["backend"] = "tokenizers"
|
|
610
|
-
kwargs["files_loaded"] = files_loaded
|
|
611
|
-
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs)
|
|
612
|
-
except Exception:
|
|
613
|
-
pass
|
|
614
|
-
|
|
615
|
-
# If all methods failed, raise an error
|
|
616
|
-
raise ValueError(
|
|
617
|
-
f"Could not load tokenizer from {pretrained_model_name_or_path} using tokenizers backend. "
|
|
618
|
-
"No tokenizer.json, tekken.json, vocab.json+merges.txt, vocab.txt, or compatible SentencePiece model found."
|
|
619
|
-
)
|
|
620
|
-
|
|
621
|
-
|
|
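The removed helper's docstring above spells out the asset-probing order: tokenizer.json first, then a SentencePiece *.model file, then vocab.json plus merges.txt, and finally vocab.txt for WordPiece models. A minimal standalone sketch of that same ordering for a local checkpoint directory (illustrative only; the function and its name are not part of transformers):

from pathlib import Path

def probe_tokenizer_assets(checkpoint_dir: str) -> str:
    """Report which asset family a local checkpoint would be loaded from,
    following the priority documented in the removed helper above."""
    root = Path(checkpoint_dir)
    if (root / "tokenizer.json").is_file():
        return "tokenizer.json"  # 1. native tokenizers-format file
    if any(root.glob("*.model")):
        return "sentencepiece .model"  # 2. extract vocab/merges from the SPM proto
    if (root / "vocab.json").is_file() and (root / "merges.txt").is_file():
        return "vocab.json + merges.txt"  # 3. BPE vocabulary and merge rules
    if (root / "vocab.txt").is_file():
        return "vocab.txt"  # 4. WordPiece vocabulary, one token per line
    raise ValueError(f"no known tokenizer assets found in {checkpoint_dir}")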
-def _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs):
-    """
-    Try to load a tokenizer with backend selection.
-
-    This function routes to the appropriate backend based on the 'backend' parameter:
-    - "tokenizers" (default): Uses HuggingFace tokenizers library backend
-    - "sentencepiece": Uses SentencePiece backend
-
-    For the tokenizers backend, attempts to load with the following priority:
-    1. If tokenizer.json exists, load directly
-    2. If any .model file (SPM) exists, try extracting vocab and merges
-    3. If vocab.json and merges.txt exist, load with those
-    4. Fallback to SentencePieceBackend if available
-
-    Args:
-        tokenizer_class: The tokenizer class to instantiate (can be None)
-        pretrained_model_name_or_path: Path or model id
-        inputs: Additional positional arguments for tokenizer init
-        kwargs: Additional keyword arguments (may include 'backend' parameter, defaults to "tokenizers")
-
-    Returns:
-        An instantiated tokenizer object
-
-    Raises:
-        ValueError: If no tokenizer could be loaded
-    """
-    # Extract the backend parameter - default to "tokenizers" to prioritize tokenizers backend
-    backend = kwargs.pop("backend", "tokenizers")
-
-    # Validate backend parameter
-    if backend not in ["sentencepiece", "tokenizers"]:
-        logger.warning(
-            f"Invalid backend '{backend}' specified. Valid options are 'tokenizers' or 'sentencepiece'. "
-            "Defaulting to 'tokenizers' backend."
-        )
-        backend = "tokenizers"
-
-    # Route to SentencePiece backend if requested
-    if backend == "sentencepiece":
-        if SentencePieceBackend is None:
-            raise ValueError(
-                "SentencePiece backend was requested but sentencepiece is not installed. "
-                "Please install it with: pip install sentencepiece"
-            )
-        logger.info("Loading tokenizer with SentencePiece backend")
-        # Track files loaded for SentencePiece backend
-        spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-        files_loaded = [spm_file] if spm_file else []
-        kwargs["backend"] = "sentencepiece"
-        kwargs["files_loaded"] = files_loaded
-        # Resolve the SPM file path and pass it as vocab_file
-        if spm_file is not None:
-            resolved_vocab_file = cached_file(
-                pretrained_model_name_or_path,
-                spm_file,
-                cache_dir=kwargs.get("cache_dir"),
-                force_download=kwargs.get("force_download", False),
-                proxies=kwargs.get("proxies"),
-                token=kwargs.get("token"),
-                revision=kwargs.get("revision"),
-                local_files_only=kwargs.get("local_files_only", False),
-                subfolder=kwargs.get("subfolder", ""),
-            )
-            kwargs["vocab_file"] = resolved_vocab_file
-        if isinstance(tokenizer_class, type) and issubclass(tokenizer_class, SentencePieceBackend):
-            logger.info("Loading tokenizer with SentencePiece backend using tokenizer class")
-            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-    # Route to tokenizers backend (default)
-    if backend == "tokenizers":
-        if tokenizer_class is not None:
-            # Check if tokenizer_class inherits from PreTrainedTokenizer (but not from TokenizersBackend/SentencePieceBackend)
-            # These are edge cases with custom logic (e.g., BioGptTokenizer with Moses tokenization)
-            from ...tokenization_python import PreTrainedTokenizer
-
-            # Build list of backend classes to check against
-            backend_classes = [TokenizersBackend] if TokenizersBackend else []
-            if SentencePieceBackend:
-                backend_classes.append(SentencePieceBackend)
-
-            # Check if it's a custom PreTrainedTokenizer (not a backend class)
-            is_custom_pre_trained = (
-                isinstance(tokenizer_class, type)
-                and issubclass(tokenizer_class, PreTrainedTokenizer)
-                and not any(issubclass(tokenizer_class, bc) for bc in backend_classes)
-                and tokenizer_class.__name__ not in ("PythonBackend", "PreTrainedTokenizer")
-            )
-
-            # Check if it's a completely custom tokenizer (not PreTrainedTokenizer, not backend class)
-            # e.g., MistralCommonBackend which has its own from_pretrained logic
-            inherits_from_backend = isinstance(tokenizer_class, type) and any(
-                bc and issubclass(tokenizer_class, bc) for bc in backend_classes
-            )
-            is_completely_custom = (
-                isinstance(tokenizer_class, type)
-                and not issubclass(tokenizer_class, PythonBackend)
-                and not inherits_from_backend
-            )
-
-            if is_custom_pre_trained:
-                logger.info("Loading tokenizer with custom PreTrainedTokenizer backend (edge case)")
-                # Track the backend type for custom tokenizers
-                kwargs["backend"] = "custom"
-                kwargs["files_loaded"] = []  # Custom tokenizers may load various files
-                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-            if is_completely_custom:
-                # For completely custom tokenizers (like MistralCommonBackend), try calling from_pretrained directly
-                logger.info("Loading tokenizer with custom tokenizer class (non-PreTrainedTokenizer)")
-                # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
-                custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
-                custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
-                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)
-
-            if TokenizersBackend is None:
-                raise ValueError(
-                    "Tokenizers backend is the default but tokenizers library is not installed. "
-                    "Please install it with: pip install tokenizers"
-                )
-            logger.info("Loading tokenizer with tokenizers backend")
-            try:
-                return _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs)
-            except ValueError as e:
-                # If tokenizers backend fails, try falling back to SentencePiece backend if available
-                spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-                if spm_file is not None and SentencePieceBackend is not None:
-                    logger.info(
-                        f"Tokenizers backend failed: {e}. "
-                        f"Falling back to SentencePieceBackend since {spm_file} file was found."
-                    )
-                    files_loaded = [spm_file]
-                    kwargs["backend"] = "sentencepiece"
-                    kwargs["files_loaded"] = files_loaded
-                    # Resolve the SPM file path and pass it as vocab_file
-                    resolved_vocab_file = cached_file(
-                        pretrained_model_name_or_path,
-                        spm_file,
-                        cache_dir=kwargs.get("cache_dir"),
-                        force_download=kwargs.get("force_download", False),
-                        proxies=kwargs.get("proxies"),
-                        token=kwargs.get("token"),
-                        revision=kwargs.get("revision"),
-                        local_files_only=kwargs.get("local_files_only", False),
-                        subfolder=kwargs.get("subfolder", ""),
-                    )
-                    kwargs["vocab_file"] = resolved_vocab_file
-                    if tokenizer_class is not None and issubclass(tokenizer_class, SentencePieceBackend):
-                        logger.info(
-                            "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
-                        )
-                        return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-                    return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-                # If no fallback available, try calling tokenizer class directly as last resort
-                if hasattr(tokenizer_class, "from_pretrained"):
-                    logger.info(
-                        f"Tokenizers backend failed: {e}. Trying to load tokenizer directly from tokenizer class."
-                    )
-                    # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
-                    custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
-                    custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
-                    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)
-                # Re-raise if no fallback options available
-                raise
-
-        # If no tokenizer class but tokenizers backend requested, fall back to SentencePiece if available
-        spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
-        if spm_file is not None and SentencePieceBackend is not None:
-            logger.info(
-                f"Tokenizers backend was requested but no tokenizer class found. "
-                f"Falling back to SentencePieceBackend since {spm_file} file was found."
-            )
-            files_loaded = [spm_file]
-            kwargs["backend"] = "sentencepiece"
-            kwargs["files_loaded"] = files_loaded
-            # Resolve the SPM file path and pass it as vocab_file
-            resolved_vocab_file = cached_file(
-                pretrained_model_name_or_path,
-                spm_file,
-                cache_dir=kwargs.get("cache_dir"),
-                force_download=kwargs.get("force_download", False),
-                proxies=kwargs.get("proxies"),
-                token=kwargs.get("token"),
-                revision=kwargs.get("revision"),
-                local_files_only=kwargs.get("local_files_only", False),
-                subfolder=kwargs.get("subfolder", ""),
-            )
-            kwargs["vocab_file"] = resolved_vocab_file
-            if (
-                tokenizer_class is not None
-                and SentencePieceBackend is not None
-                and issubclass(tokenizer_class, SentencePieceBackend)
-            ):
-                logger.info(
-                    "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
-                )
-                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-            return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-    raise ValueError(
-        f"Could not load tokenizer from {pretrained_model_name_or_path}. "
-        "No tokenizer class could be determined and no SentencePiece model found."
-    )
-
-
 def get_tokenizer_config(
     pretrained_model_name_or_path: Union[str, os.PathLike[str]],
     cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
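The removed `_try_load_tokenizer_with_fallbacks` above routed on a `backend` keyword: "tokenizers" by default, "sentencepiece" on request, with unrecognized values logged and coerced back to the default before any file resolution happened. A compact sketch of just that validation step (names are illustrative, not the transformers API):

import logging
from typing import Optional

logger = logging.getLogger(__name__)

def resolve_backend(requested: Optional[str]) -> str:
    """Validate a requested tokenizer backend the way the removed helper did:
    unknown values fall back to the 'tokenizers' default with a warning."""
    backend = requested or "tokenizers"
    if backend not in ("tokenizers", "sentencepiece"):
        logger.warning("Invalid backend %r; defaulting to 'tokenizers'.", backend)
        backend = "tokenizers"
    return backend

# resolve_backend("sentencepiece") -> "sentencepiece"
# resolve_backend("rust")          -> "tokenizers" (plus a warning)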
@@ -1084,7 +692,7 @@ class AutoTokenizer:
 
         if (
             config_tokenizer_class is not None
-            and config_tokenizer_class != "
+            and config_tokenizer_class != "TokenizersBackend"
             and "Fast" in config_tokenizer_class
         ):
             config_tokenizer_class = config_tokenizer_class[:-4]
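The only change in this hunk is the class name the guard compares against; the long-standing normalization of legacy "*Fast" names is kept: a stored tokenizer_class that still carries the Fast suffix (and is not already the new TokenizersBackend) has the suffix stripped before lookup. In isolation (illustrative helper, not library code):

def normalize_legacy_class_name(config_tokenizer_class: str) -> str:
    # Mirrors the guard above: drop a trailing "Fast" so a legacy name such as
    # "LlamaTokenizerFast" resolves to the same candidate as "LlamaTokenizer".
    if config_tokenizer_class != "TokenizersBackend" and "Fast" in config_tokenizer_class:
        return config_tokenizer_class[:-4]
    return config_tokenizer_class

assert normalize_legacy_class_name("LlamaTokenizerFast") == "LlamaTokenizer"
assert normalize_legacy_class_name("TokenizersBackend") == "TokenizersBackend"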
@@ -1125,10 +733,12 @@ class AutoTokenizer:
             tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
             if tokenizer_class is None and not tokenizer_class_candidate.endswith("Fast"):
                 tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate + "Fast")
+                if tokenizer_class.__name__ == "PythonBackend":  # unless you inherit from it?
+                    tokenizer_class = TokenizersBackend
             else:
                 tokenizer_class = fast_tokenizer_class
 
-            return
+            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Otherwise we have to be creative.
         # if model is an encoder decoder, the encoder tokenizer class is used by default
@@ -1144,17 +754,9 @@ class AutoTokenizer:
 
         model_type = config_class_to_model_type(type(config).__name__)
         if model_type is not None:
-            tokenizer_class = TOKENIZER_MAPPING
-
+            tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend)
             if tokenizer_class is not None:
-                return
-                    tokenizer_class, pretrained_model_name_or_path, inputs, kwargs
-                )
-            else:
-                raise ValueError(
-                    "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
-                    "in order to use this tokenizer."
-                )
+                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"