transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -20,18 +20,19 @@ import copy
|
|
|
20
20
|
import json
|
|
21
21
|
import os
|
|
22
22
|
from collections import defaultdict
|
|
23
|
+
from collections.abc import Iterable
|
|
23
24
|
from shutil import copyfile
|
|
24
25
|
from typing import Any, Optional, Union
|
|
25
26
|
|
|
26
27
|
import tokenizers.pre_tokenizers as pre_tokenizers_fast
|
|
28
|
+
from huggingface_hub import is_offline_mode
|
|
27
29
|
from tokenizers import AddedToken, processors
|
|
28
30
|
from tokenizers import Encoding as EncodingFast
|
|
29
31
|
from tokenizers import Tokenizer as TokenizerFast
|
|
30
|
-
from tokenizers import normalizers as tokenizers_normalizers
|
|
31
32
|
from tokenizers.decoders import Decoder as DecoderFast
|
|
33
|
+
from tokenizers.models import BPE, Unigram
|
|
32
34
|
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
|
|
33
35
|
|
|
34
|
-
from .convert_slow_tokenizer import convert_slow_tokenizer
|
|
35
36
|
from .integrations.ggml import convert_gguf_tokenizer
|
|
36
37
|
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
|
|
37
38
|
from .tokenization_utils_base import (
|
|
@@ -41,8 +42,9 @@ from .tokenization_utils_base import (
|
|
|
41
42
|
PreTrainedTokenizerBase,
|
|
42
43
|
TextInput,
|
|
43
44
|
TruncationStrategy,
|
|
45
|
+
generate_merges,
|
|
44
46
|
)
|
|
45
|
-
from .utils import PaddingStrategy, add_end_docstrings,
|
|
47
|
+
from .utils import PaddingStrategy, add_end_docstrings, logging
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
logger = logging.get_logger(__name__)
|
|
@@ -90,26 +92,162 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
90
92
|
"""
|
|
91
93
|
|
|
92
94
|
vocab_files_names = VOCAB_FILES_NAMES
|
|
95
|
+
model = None
|
|
96
|
+
_tokenizer = None
|
|
97
|
+
|
|
98
|
+
@classmethod
|
|
99
|
+
def convert_to_native_format(cls, trust_remote_code=False, **kwargs):
|
|
100
|
+
"""s
|
|
101
|
+
Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
|
|
102
|
+
models, tekken.json, vocab/merges).
|
|
103
|
+
"""
|
|
104
|
+
# Preserve kwargs for possible downstream use
|
|
105
|
+
local_kwargs = dict(kwargs)
|
|
106
|
+
fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)
|
|
107
|
+
|
|
108
|
+
if (
|
|
109
|
+
fast_tokenizer_file is not None
|
|
110
|
+
and os.path.isfile(fast_tokenizer_file)
|
|
111
|
+
and (cls is TokenizersBackend or "__init__" not in cls.__dict__ or trust_remote_code)
|
|
112
|
+
):
|
|
113
|
+
local_kwargs["tokenizer_object"] = TokenizerFast.from_file(fast_tokenizer_file)
|
|
114
|
+
return local_kwargs
|
|
115
|
+
elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
|
|
116
|
+
# we extract vocab / merges from the tokenizer file to pass them to __init__
|
|
117
|
+
processor = TokenizerFast.from_file(fast_tokenizer_file).post_processor
|
|
118
|
+
with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_handle:
|
|
119
|
+
tokenizer_json = json.load(tokenizer_handle)
|
|
120
|
+
vocab = tokenizer_json.get("model", {}).get("vocab", None)
|
|
121
|
+
if cls.model is None:
|
|
122
|
+
if isinstance(vocab, list):
|
|
123
|
+
vocab = list(map(tuple, vocab)) # TODO just for now
|
|
124
|
+
elif cls.model.__name__ == "Unigram":
|
|
125
|
+
if vocab and isinstance(vocab[0], (list, tuple)):
|
|
126
|
+
vocab = [tuple(item) for item in vocab]
|
|
127
|
+
elif cls.model.__name__ == "WordLevel":
|
|
128
|
+
vocab = {token: i for i, token in enumerate(vocab)}
|
|
129
|
+
elif cls.model.__name__ == "BPE" or cls.model.__name__ == "WordPiece":
|
|
130
|
+
if isinstance(vocab, list):
|
|
131
|
+
vocab = {token[0] if isinstance(token, list) else token: i for i, token in enumerate(vocab)}
|
|
132
|
+
local_kwargs["vocab"] = vocab
|
|
133
|
+
|
|
134
|
+
model_type = getattr(cls, "model", None)
|
|
135
|
+
if "merges" in tokenizer_json.get("model", {}) and (model_type and model_type.__name__ == "BPE"):
|
|
136
|
+
merges = tokenizer_json["model"]["merges"]
|
|
137
|
+
merges = [tuple(merge.split(" ")) if isinstance(merge, str) else tuple(merge) for merge in merges]
|
|
138
|
+
local_kwargs["merges"] = merges
|
|
139
|
+
|
|
140
|
+
if processor is not None:
|
|
141
|
+
local_kwargs["post_processor"] = processor
|
|
142
|
+
return local_kwargs
|
|
143
|
+
|
|
144
|
+
vocab_file = local_kwargs.get("vocab_file")
|
|
145
|
+
merges_file = local_kwargs.get("merges_file")
|
|
146
|
+
vocab = local_kwargs.get("vocab")
|
|
147
|
+
merges = local_kwargs.get("merges")
|
|
148
|
+
|
|
149
|
+
# Tekken converter (Mistral)
|
|
150
|
+
if isinstance(vocab_file, str) and vocab_file.endswith("tekken.json") and os.path.isfile(vocab_file):
|
|
151
|
+
from .convert_slow_tokenizer import MistralConverter
|
|
152
|
+
|
|
153
|
+
local_kwargs["vocab"], local_kwargs["merges"] = MistralConverter(
|
|
154
|
+
vocab_file=vocab_file
|
|
155
|
+
).extract_vocab_merges_from_model(vocab_file)
|
|
156
|
+
return local_kwargs
|
|
157
|
+
|
|
158
|
+
# SentencePiece model (with TikToken fallback)
|
|
159
|
+
if isinstance(vocab_file, str) and os.path.isfile(vocab_file) and vocab_file.endswith(".model"):
|
|
160
|
+
try:
|
|
161
|
+
from .convert_slow_tokenizer import SentencePieceExtractor
|
|
162
|
+
|
|
163
|
+
local_kwargs = SentencePieceExtractor(vocab_file).extract(cls.model, **local_kwargs)
|
|
164
|
+
try:
|
|
165
|
+
from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
|
|
166
|
+
|
|
167
|
+
converter_class = SLOW_TO_FAST_CONVERTERS.get(cls.__name__)
|
|
168
|
+
if converter_class is not None and hasattr(converter_class, "convert_from_spm"):
|
|
169
|
+
local_kwargs = converter_class.convert_from_spm(**local_kwargs)
|
|
170
|
+
except Exception as e:
|
|
171
|
+
logger.warning(
|
|
172
|
+
f"Could not reorder vocab using converter for {cls.__name__} due to {e}. Falling back to raw SentencePiece extraction."
|
|
173
|
+
)
|
|
174
|
+
# what used to be in `convert_slow`
|
|
175
|
+
if hasattr(cls, "convert_from_spm_model"):
|
|
176
|
+
local_kwargs = cls.convert_from_spm_model(**local_kwargs)
|
|
177
|
+
except Exception as e: # TODO only catch deserialization error here!
|
|
178
|
+
logger.warning(
|
|
179
|
+
f"Could not extract SentencePiece model from {vocab_file} using sentencepiece library due to {e}. "
|
|
180
|
+
"Falling back to TikToken extractor."
|
|
181
|
+
)
|
|
182
|
+
from .convert_slow_tokenizer import TikTokenConverter
|
|
183
|
+
|
|
184
|
+
local_kwargs["vocab"], local_kwargs["merges"] = TikTokenConverter(
|
|
185
|
+
vocab_file=vocab_file, extra_special_tokens=local_kwargs.get("extra_special_tokens")
|
|
186
|
+
).extract_vocab_merges_from_model(vocab_file)
|
|
187
|
+
|
|
188
|
+
return local_kwargs
|
|
189
|
+
|
|
190
|
+
# Fallback to standard vocab/merges files if they existed!
|
|
191
|
+
if vocab is None and isinstance(vocab_file, str) and os.path.isfile(vocab_file):
|
|
192
|
+
local_kwargs["vocab"] = vocab_file
|
|
193
|
+
vocab = local_kwargs["vocab"]
|
|
194
|
+
if merges is None and isinstance(merges_file, str) and os.path.isfile(merges_file):
|
|
195
|
+
local_kwargs["merges"] = merges_file
|
|
196
|
+
merges = local_kwargs["merges"]
|
|
197
|
+
|
|
198
|
+
# Generate merges automatically when not provided for BPE tokenizers
|
|
199
|
+
if merges is None and cls.model is not None and cls.model.__name__ == "BPE" and isinstance(vocab, dict):
|
|
200
|
+
# Gather special tokens from kwargs to skip in merge generation
|
|
201
|
+
def _iter_special_tokens(values: Iterable[Any]) -> list[str]:
|
|
202
|
+
collected: list[str] = []
|
|
203
|
+
for val in values:
|
|
204
|
+
if val is None:
|
|
205
|
+
continue
|
|
206
|
+
if isinstance(val, (list, tuple)):
|
|
207
|
+
collected.extend(_iter_special_tokens(val))
|
|
208
|
+
else:
|
|
209
|
+
collected.append(str(val))
|
|
210
|
+
return collected
|
|
211
|
+
|
|
212
|
+
special_tokens_keys = [
|
|
213
|
+
"pad_token",
|
|
214
|
+
"unk_token",
|
|
215
|
+
"bos_token",
|
|
216
|
+
"eos_token",
|
|
217
|
+
"sep_token",
|
|
218
|
+
"cls_token",
|
|
219
|
+
"mask_token",
|
|
220
|
+
"additional_special_tokens",
|
|
221
|
+
"extra_special_tokens",
|
|
222
|
+
]
|
|
223
|
+
skip_tokens: set[str] = set()
|
|
224
|
+
for key in special_tokens_keys:
|
|
225
|
+
if key in local_kwargs:
|
|
226
|
+
skip_tokens.update(_iter_special_tokens([local_kwargs[key]]))
|
|
227
|
+
|
|
228
|
+
merges = generate_merges(vocab, skip_tokens=skip_tokens)
|
|
229
|
+
local_kwargs["merges"] = merges
|
|
230
|
+
return local_kwargs
|
|
93
231
|
|
|
94
232
|
def __init__(self, *args, **kwargs):
|
|
95
233
|
tokenizer_object = kwargs.pop("tokenizer_object", None)
|
|
96
|
-
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
|
|
97
234
|
gguf_file = kwargs.pop("gguf_file", None)
|
|
98
235
|
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
|
|
99
|
-
from_slow = kwargs.pop("from_slow", False)
|
|
100
236
|
# Note: added_tokens_decoder is NOT popped - it's passed to super().__init__() for processing
|
|
101
237
|
added_tokens_decoder = kwargs.get("added_tokens_decoder", {})
|
|
102
238
|
# Store add_prefix_space before super().__init__() to ensure it's not overridden
|
|
103
239
|
add_prefix_space = kwargs.get("add_prefix_space", False)
|
|
240
|
+
vocab_file = kwargs.get("vocab_file")
|
|
241
|
+
|
|
242
|
+
vocab = kwargs.get("vocab")
|
|
243
|
+
merges = kwargs.get("merges")
|
|
104
244
|
|
|
245
|
+
fast_tokenizer = None
|
|
105
246
|
if tokenizer_object is not None:
|
|
106
247
|
fast_tokenizer = copy.deepcopy(tokenizer_object)
|
|
107
|
-
elif fast_tokenizer_file is not None and
|
|
248
|
+
elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
|
|
108
249
|
# We have a serialization from tokenizers which let us directly build the backend
|
|
109
250
|
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
|
|
110
|
-
elif slow_tokenizer:
|
|
111
|
-
# We need to convert a slow tokenizer to build the backend
|
|
112
|
-
fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
|
|
113
251
|
elif gguf_file is not None:
|
|
114
252
|
# We need to convert a slow tokenizer to build the backend
|
|
115
253
|
gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
|
|
@@ -120,18 +258,16 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
120
258
|
kwargs.update(tokenizer_config)
|
|
121
259
|
if len(additional_kwargs) > 0:
|
|
122
260
|
kwargs.update(additional_kwargs)
|
|
123
|
-
elif self.
|
|
124
|
-
#
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
slow_tokenizer = None
|
|
134
|
-
else:
|
|
261
|
+
elif self._tokenizer is None and vocab is not None:
|
|
262
|
+
# Build from vocab/merges extracted by convert_to_native_format
|
|
263
|
+
if merges is not None:
|
|
264
|
+
vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
|
|
265
|
+
fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
|
|
266
|
+
elif isinstance(vocab, dict):
|
|
267
|
+
fast_tokenizer = TokenizerFast(BPE(vocab=vocab, merges=[], fuse_unk=True, dropout=None))
|
|
268
|
+
elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
|
|
269
|
+
fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
|
|
270
|
+
elif self._tokenizer is None:
|
|
135
271
|
raise ValueError(
|
|
136
272
|
"Couldn't instantiate the backend tokenizer from one of: \n"
|
|
137
273
|
"(1) a `tokenizers` library serialization file, \n"
|
|
@@ -139,11 +275,16 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
139
275
|
"(3) an equivalent slow tokenizer class to instantiate and convert. \n"
|
|
140
276
|
"You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
|
|
141
277
|
)
|
|
278
|
+
# Only set defaults when creating TokenizersBackend from scratch
|
|
279
|
+
if fast_tokenizer_file is None and tokenizer_object is None and self._tokenizer is None:
|
|
280
|
+
kwargs.setdefault("bos_token", "<s>")
|
|
281
|
+
kwargs.setdefault("eos_token", "</s>")
|
|
142
282
|
|
|
143
|
-
|
|
283
|
+
if fast_tokenizer is not None:
|
|
284
|
+
self._tokenizer = fast_tokenizer
|
|
144
285
|
|
|
145
|
-
if
|
|
146
|
-
|
|
286
|
+
if self._tokenizer is None:
|
|
287
|
+
raise ValueError("The backend tokenizer is not correctly initialized.")
|
|
147
288
|
|
|
148
289
|
_truncation = self._tokenizer.truncation
|
|
149
290
|
|
|
@@ -169,8 +310,17 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
169
310
|
if "backend" not in kwargs:
|
|
170
311
|
kwargs["backend"] = "tokenizers"
|
|
171
312
|
|
|
313
|
+
explicit_bos_eos_in_kwargs = "add_bos_token" in kwargs or "add_eos_token" in kwargs
|
|
314
|
+
self._add_bos_token = kwargs.get("add_bos_token", False)
|
|
315
|
+
self._add_eos_token = kwargs.get("add_eos_token", False)
|
|
316
|
+
if post_processor := kwargs.pop("post_processor", None): # most reliable way to get the post-processor
|
|
317
|
+
self._tokenizer.post_processor = post_processor
|
|
318
|
+
self._should_update_post_processor = explicit_bos_eos_in_kwargs or self._tokenizer.post_processor is None
|
|
172
319
|
# We call this after having initialized the backend tokenizer because we update it.
|
|
173
320
|
super().__init__(**kwargs)
|
|
321
|
+
|
|
322
|
+
if vocab_file is not None:
|
|
323
|
+
self.vocab_file = vocab_file
|
|
174
324
|
# Ensure add_prefix_space is set correctly after parent init
|
|
175
325
|
self.add_prefix_space = add_prefix_space
|
|
176
326
|
self._tokenizer.encode_special_tokens = self.split_special_tokens
|
|
@@ -210,7 +360,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
210
360
|
tokens.append(token)
|
|
211
361
|
if tokens:
|
|
212
362
|
# These tokens are from the special tokens map
|
|
213
|
-
self.add_tokens(tokens
|
|
363
|
+
self.add_tokens(tokens)
|
|
214
364
|
|
|
215
365
|
try:
|
|
216
366
|
vocab_size = self._tokenizer.get_vocab_size()
|
|
@@ -228,6 +378,12 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
228
378
|
**kwargs,
|
|
229
379
|
)
|
|
230
380
|
|
|
381
|
+
self._should_update_post_processor = (
|
|
382
|
+
self._should_update_post_processor or self._tokenizer.post_processor is None
|
|
383
|
+
)
|
|
384
|
+
if self._should_update_post_processor:
|
|
385
|
+
self.update_post_processor()
|
|
386
|
+
|
|
231
387
|
@property
|
|
232
388
|
def is_fast(self) -> bool:
|
|
233
389
|
return True
|
|
@@ -273,7 +429,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
273
429
|
# If eos_token is None and add_eos_token is True, silently disable add_eos_token
|
|
274
430
|
# This allows tokenizers to set add_eos_token even if eos_token is not configured
|
|
275
431
|
if eos is None and self.add_eos_token:
|
|
276
|
-
self.
|
|
432
|
+
self.add_eos_token = False
|
|
277
433
|
return
|
|
278
434
|
|
|
279
435
|
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
|
|
@@ -320,98 +476,24 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
320
476
|
if token_value is None:
|
|
321
477
|
continue
|
|
322
478
|
if isinstance(token_value, AddedToken):
|
|
323
|
-
|
|
324
|
-
tokens_to_add.append(token_value)
|
|
479
|
+
tokens_to_add.append(token_value)
|
|
325
480
|
elif isinstance(token_value, str):
|
|
326
|
-
|
|
327
|
-
tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
|
|
481
|
+
tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
|
|
328
482
|
|
|
329
483
|
# V5: Check extra special tokens
|
|
330
484
|
for token in self._extra_special_tokens:
|
|
331
485
|
if isinstance(token, AddedToken):
|
|
332
|
-
|
|
333
|
-
tokens_to_add.append(token)
|
|
486
|
+
tokens_to_add.append(token)
|
|
334
487
|
elif isinstance(token, str):
|
|
335
|
-
|
|
336
|
-
tokens_to_add.append(AddedToken(token, special=True, normalized=False))
|
|
488
|
+
tokens_to_add.append(AddedToken(token, special=True, normalized=False))
|
|
337
489
|
|
|
338
490
|
if tokens_to_add:
|
|
339
491
|
# Ensure special tokens are added as such to the backend
|
|
340
492
|
self.add_tokens(tokens_to_add, special_tokens=True)
|
|
341
493
|
|
|
342
|
-
if
|
|
494
|
+
if getattr(self, "_should_update_post_processor", True) or self._tokenizer.post_processor is None:
|
|
343
495
|
self.update_post_processor()
|
|
344
496
|
|
|
345
|
-
# Update add_prefix_space in the pre_tokenizer if needed
|
|
346
|
-
if hasattr(self, "add_prefix_space"):
|
|
347
|
-
try:
|
|
348
|
-
tokenizer_json = json.loads(self.backend_tokenizer.to_str())
|
|
349
|
-
pre_tok = tokenizer_json.get("pre_tokenizer", {})
|
|
350
|
-
|
|
351
|
-
# Recursively update add_prefix_space in pretokenizers
|
|
352
|
-
def update_add_prefix_space(pretok_dict, value):
|
|
353
|
-
updated = False
|
|
354
|
-
if pretok_dict.get("type") == "Sequence":
|
|
355
|
-
for nested in pretok_dict.get("pretokenizers", []):
|
|
356
|
-
updated |= update_add_prefix_space(nested, value)
|
|
357
|
-
elif "add_prefix_space" in pretok_dict and pretok_dict["add_prefix_space"] != value:
|
|
358
|
-
pretok_dict["add_prefix_space"] = value
|
|
359
|
-
updated = True
|
|
360
|
-
return updated
|
|
361
|
-
|
|
362
|
-
if update_add_prefix_space(pre_tok, self.add_prefix_space):
|
|
363
|
-
self._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
|
|
364
|
-
except Exception:
|
|
365
|
-
pass
|
|
366
|
-
|
|
367
|
-
# Ensure normalizer flags (lowercase/accents/chinese chars) reflect tokenizer attributes
|
|
368
|
-
try:
|
|
369
|
-
normalizer = self.backend_tokenizer.normalizer
|
|
370
|
-
if normalizer is not None:
|
|
371
|
-
norm_state = json.loads(normalizer.__getstate__())
|
|
372
|
-
norm_type = norm_state.get("type")
|
|
373
|
-
|
|
374
|
-
desired_lowercase = getattr(self, "do_lower_case", None)
|
|
375
|
-
desired_strip_accents = getattr(self, "strip_accents", None)
|
|
376
|
-
# Some tokenizers expose keep_accents instead of strip_accents
|
|
377
|
-
if desired_strip_accents is None and hasattr(self, "keep_accents") and "strip_accents" in norm_state:
|
|
378
|
-
keep_accents_value = getattr(self, "keep_accents")
|
|
379
|
-
if keep_accents_value is not None:
|
|
380
|
-
desired_strip_accents = not keep_accents_value
|
|
381
|
-
desired_handle_chinese = getattr(self, "tokenize_chinese_chars", None)
|
|
382
|
-
|
|
383
|
-
updated = False
|
|
384
|
-
if (
|
|
385
|
-
desired_lowercase is not None
|
|
386
|
-
and "lowercase" in norm_state
|
|
387
|
-
and norm_state["lowercase"] != desired_lowercase
|
|
388
|
-
):
|
|
389
|
-
norm_state["lowercase"] = desired_lowercase
|
|
390
|
-
updated = True
|
|
391
|
-
if (
|
|
392
|
-
desired_strip_accents is not None
|
|
393
|
-
and "strip_accents" in norm_state
|
|
394
|
-
and norm_state["strip_accents"] != desired_strip_accents
|
|
395
|
-
):
|
|
396
|
-
norm_state["strip_accents"] = desired_strip_accents
|
|
397
|
-
updated = True
|
|
398
|
-
if (
|
|
399
|
-
desired_handle_chinese is not None
|
|
400
|
-
and "handle_chinese_chars" in norm_state
|
|
401
|
-
and norm_state["handle_chinese_chars"] != desired_handle_chinese
|
|
402
|
-
):
|
|
403
|
-
norm_state["handle_chinese_chars"] = desired_handle_chinese
|
|
404
|
-
updated = True
|
|
405
|
-
|
|
406
|
-
if updated and norm_type is not None:
|
|
407
|
-
norm_class = getattr(tokenizers_normalizers, norm_type, None)
|
|
408
|
-
if norm_class is not None:
|
|
409
|
-
norm_state.pop("type", None)
|
|
410
|
-
self.backend_tokenizer.normalizer = norm_class(**norm_state)
|
|
411
|
-
except Exception:
|
|
412
|
-
# Best-effort: do not block initialization on normalizer reconciliation
|
|
413
|
-
pass
|
|
414
|
-
|
|
415
497
|
@property
|
|
416
498
|
def vocab_size(self) -> int:
|
|
417
499
|
"""
|
|
@@ -839,6 +921,8 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
839
921
|
|
|
840
922
|
if isinstance(token_ids, int):
|
|
841
923
|
token_ids = [token_ids]
|
|
924
|
+
if isinstance(token_ids, dict):
|
|
925
|
+
token_ids = token_ids["input_ids"]
|
|
842
926
|
return self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
|
843
927
|
|
|
844
928
|
def _save_pretrained(
|
|
@@ -1132,7 +1216,7 @@ class TokenizersBackend(PreTrainedTokenizerBase):
|
|
|
1132
1216
|
]
|
|
1133
1217
|
):
|
|
1134
1218
|
return tokenizer
|
|
1135
|
-
elif transformers_version and version.parse(transformers_version)
|
|
1219
|
+
elif transformers_version and version.parse(transformers_version) > version.parse("4.57.3"):
|
|
1136
1220
|
return tokenizer
|
|
1137
1221
|
|
|
1138
1222
|
mistral_config_detected = True
|
transformers/trainer.py
CHANGED
|
@@ -642,6 +642,16 @@ class Trainer:
|
|
|
642
642
|
"You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
|
|
643
643
|
)
|
|
644
644
|
default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
|
|
645
|
+
|
|
646
|
+
# Add JIT checkpoint callback if enabled
|
|
647
|
+
if self.args.enable_jit_checkpoint:
|
|
648
|
+
from .trainer_jit_checkpoint import JITCheckpointCallback
|
|
649
|
+
|
|
650
|
+
jit_callback = JITCheckpointCallback()
|
|
651
|
+
default_callbacks = default_callbacks + [jit_callback]
|
|
652
|
+
# Set trainer reference for JIT callback after initialization
|
|
653
|
+
jit_callback.set_trainer(self)
|
|
654
|
+
|
|
645
655
|
callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
|
|
646
656
|
self.callback_handler = CallbackHandler(
|
|
647
657
|
callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
|
|
@@ -1661,6 +1671,12 @@ class Trainer:
|
|
|
1661
1671
|
optimizer_cls = AdamW8bit
|
|
1662
1672
|
else:
|
|
1663
1673
|
raise ValueError("Invalid optimizer")
|
|
1674
|
+
optimizer_kwargs.update(
|
|
1675
|
+
{
|
|
1676
|
+
"block_size": optim_args.get("block_size", 256),
|
|
1677
|
+
"bf16_stochastic_round": strtobool(optim_args.get("bf16_stochastic_round", "False")),
|
|
1678
|
+
}
|
|
1679
|
+
)
|
|
1664
1680
|
optimizer_kwargs.update(adam_kwargs)
|
|
1665
1681
|
elif args.optim in [
|
|
1666
1682
|
OptimizerNames.SCHEDULE_FREE_RADAM,
|
|
@@ -2338,6 +2354,9 @@ class Trainer:
|
|
|
2338
2354
|
|
|
2339
2355
|
if self.is_fsdp_enabled:
|
|
2340
2356
|
self.model = self.model_wrapped = model
|
|
2357
|
+
# Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
|
|
2358
|
+
if hasattr(self.model, "generate"):
|
|
2359
|
+
dist.fsdp.register_fsdp_forward_method(self.model, "generate")
|
|
2341
2360
|
|
|
2342
2361
|
# for the rest of this function `model` is the outside model, whether it was wrapped or not
|
|
2343
2362
|
if model is not self.model:
|
|
@@ -2428,8 +2447,6 @@ class Trainer:
|
|
|
2428
2447
|
|
|
2429
2448
|
for epoch in range(epochs_trained, num_train_epochs):
|
|
2430
2449
|
epoch_dataloader = train_dataloader
|
|
2431
|
-
if hasattr(epoch_dataloader, "set_epoch"):
|
|
2432
|
-
epoch_dataloader.set_epoch(epoch)
|
|
2433
2450
|
|
|
2434
2451
|
steps_in_epoch = (
|
|
2435
2452
|
len(epoch_dataloader)
|
|
@@ -2450,6 +2467,9 @@ class Trainer:
|
|
|
2450
2467
|
elif steps_trained_in_current_epoch == 0:
|
|
2451
2468
|
self._load_rng_state(resume_from_checkpoint)
|
|
2452
2469
|
|
|
2470
|
+
if hasattr(epoch_dataloader, "set_epoch"):
|
|
2471
|
+
epoch_dataloader.set_epoch(epoch)
|
|
2472
|
+
|
|
2453
2473
|
epoch_iterator = iter(epoch_dataloader)
|
|
2454
2474
|
# We chunkify the epoch iterator into gradient accumulation steps `n` batches
|
|
2455
2475
|
remainder = steps_in_epoch % args.gradient_accumulation_steps
|
|
@@ -2788,7 +2808,7 @@ class Trainer:
|
|
|
2788
2808
|
)
|
|
2789
2809
|
else:
|
|
2790
2810
|
# We load the model state dict on the CPU to avoid an OOM error.
|
|
2791
|
-
if
|
|
2811
|
+
if os.path.isfile(safe_weights_file):
|
|
2792
2812
|
state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
|
|
2793
2813
|
else:
|
|
2794
2814
|
check_torch_load_is_safe()
|
|
@@ -2828,9 +2848,7 @@ class Trainer:
|
|
|
2828
2848
|
logger.warning(f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed")
|
|
2829
2849
|
else:
|
|
2830
2850
|
# We load the sharded checkpoint
|
|
2831
|
-
load_result = load_sharded_checkpoint(
|
|
2832
|
-
model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors
|
|
2833
|
-
)
|
|
2851
|
+
load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
|
|
2834
2852
|
if not is_sagemaker_mp_enabled():
|
|
2835
2853
|
self._issue_warnings_after_load(load_result)
|
|
2836
2854
|
|
|
@@ -2913,7 +2931,7 @@ class Trainer:
|
|
|
2913
2931
|
has_been_loaded = False
|
|
2914
2932
|
else:
|
|
2915
2933
|
# We load the model state dict on the CPU to avoid an OOM error.
|
|
2916
|
-
if
|
|
2934
|
+
if os.path.isfile(best_safe_model_path):
|
|
2917
2935
|
state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
|
|
2918
2936
|
else:
|
|
2919
2937
|
check_torch_load_is_safe()
|
|
@@ -3932,6 +3950,9 @@ class Trainer:
|
|
|
3932
3950
|
# Both standard transformer models and Liger-patched models handle shift_labels correctly,
|
|
3933
3951
|
# so we can directly use the computed loss from the model output.
|
|
3934
3952
|
# See: https://huggingface.co/docs/accelerate/en/concept_guides/sequence_parallelism
|
|
3953
|
+
if "labels" not in inputs and "shift_labels" in inputs:
|
|
3954
|
+
# DeepSpeed SP Dataloader removes "labels" but we need it, otherwise, we won't compute the loss.
|
|
3955
|
+
inputs["labels"] = inputs["shift_labels"]
|
|
3935
3956
|
outputs = model(**inputs)
|
|
3936
3957
|
loss = outputs.loss
|
|
3937
3958
|
|
|
@@ -4007,7 +4028,16 @@ class Trainer:
|
|
|
4007
4028
|
self._save(output_dir, state_dict=state_dict)
|
|
4008
4029
|
elif self.is_deepspeed_enabled:
|
|
4009
4030
|
try:
|
|
4010
|
-
|
|
4031
|
+
accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
|
|
4032
|
+
inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
|
|
4033
|
+
)
|
|
4034
|
+
zero3_sharding = self.deepspeed.config.get("zero_optimization", {}).get("stage", None) == 3
|
|
4035
|
+
if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
|
|
4036
|
+
# When using PEFT with DeepSpeed ZeRO Stage 3,
|
|
4037
|
+
# we do not need to load the frozen parameters
|
|
4038
|
+
state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
|
|
4039
|
+
else:
|
|
4040
|
+
state_dict = self.accelerator.get_state_dict(self.deepspeed)
|
|
4011
4041
|
if self.args.should_save:
|
|
4012
4042
|
self._save(output_dir, state_dict=state_dict)
|
|
4013
4043
|
except ValueError:
|
|
@@ -4067,12 +4097,7 @@ class Trainer:
|
|
|
4067
4097
|
model = model.module.module
|
|
4068
4098
|
unwrapped_model = self.accelerator.unwrap_model(model)
|
|
4069
4099
|
if isinstance(unwrapped_model, supported_classes):
|
|
4070
|
-
unwrapped_model.save_pretrained(
|
|
4071
|
-
output_dir,
|
|
4072
|
-
state_dict=full_state_dict,
|
|
4073
|
-
save_function=xm.save,
|
|
4074
|
-
safe_serialization=self.args.save_safetensors,
|
|
4075
|
-
)
|
|
4100
|
+
unwrapped_model.save_pretrained(output_dir, state_dict=full_state_dict)
|
|
4076
4101
|
else:
|
|
4077
4102
|
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
|
|
4078
4103
|
xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
|
|
@@ -4082,8 +4107,6 @@ class Trainer:
|
|
|
4082
4107
|
output_dir,
|
|
4083
4108
|
is_main_process=self.args.should_save,
|
|
4084
4109
|
state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
|
|
4085
|
-
save_function=xm.save,
|
|
4086
|
-
safe_serialization=self.args.save_safetensors,
|
|
4087
4110
|
)
|
|
4088
4111
|
else:
|
|
4089
4112
|
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
|
|
@@ -4093,8 +4116,6 @@ class Trainer:
|
|
|
4093
4116
|
model.save_pretrained(
|
|
4094
4117
|
output_dir,
|
|
4095
4118
|
is_main_process=self.args.should_save,
|
|
4096
|
-
save_function=xm.save,
|
|
4097
|
-
safe_serialization=self.args.save_safetensors,
|
|
4098
4119
|
state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
|
|
4099
4120
|
)
|
|
4100
4121
|
if self.processing_class is not None and self.args.should_save:
|
|
@@ -4115,20 +4136,15 @@ class Trainer:
|
|
|
4115
4136
|
|
|
4116
4137
|
if isinstance(self.accelerator.unwrap_model(self.model, keep_torch_compile=False), supported_classes):
|
|
4117
4138
|
self.accelerator.unwrap_model(self.model, keep_torch_compile=False).save_pretrained(
|
|
4118
|
-
output_dir, state_dict=state_dict
|
|
4139
|
+
output_dir, state_dict=state_dict
|
|
4119
4140
|
)
|
|
4120
4141
|
else:
|
|
4121
4142
|
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
|
|
4122
|
-
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
)
|
|
4126
|
-
else:
|
|
4127
|
-
torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
|
|
4143
|
+
safetensors.torch.save_file(
|
|
4144
|
+
state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
|
|
4145
|
+
)
|
|
4128
4146
|
else:
|
|
4129
|
-
self.model.save_pretrained(
|
|
4130
|
-
output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
|
|
4131
|
-
)
|
|
4147
|
+
self.model.save_pretrained(output_dir, state_dict=state_dict)
|
|
4132
4148
|
|
|
4133
4149
|
if self.processing_class is not None:
|
|
4134
4150
|
self.processing_class.save_pretrained(output_dir)
|
|
@@ -4827,6 +4843,7 @@ class Trainer:
|
|
|
4827
4843
|
if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
|
|
4828
4844
|
return
|
|
4829
4845
|
|
|
4846
|
+
self.callback_handler.on_push_begin(self.args, self.state, self.control)
|
|
4830
4847
|
output_dir = self.args.output_dir
|
|
4831
4848
|
# To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
|
|
4832
4849
|
modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
|
|
@@ -4921,6 +4938,8 @@ class Trainer:
|
|
|
4921
4938
|
The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
|
|
4922
4939
|
progress of the commit if `blocking=True`.
|
|
4923
4940
|
"""
|
|
4941
|
+
self.callback_handler.on_push_begin(self.args, self.state, self.control)
|
|
4942
|
+
|
|
4924
4943
|
model_name = kwargs.pop("model_name", None)
|
|
4925
4944
|
if model_name is None and self.args.should_save:
|
|
4926
4945
|
if self.args.hub_model_id is None:
|
|
@@ -5074,14 +5093,14 @@ class Trainer:
|
|
|
5074
5093
|
self.is_tp_enabled = False
|
|
5075
5094
|
if getattr(self.model, "tp_size", None) is not None and self.model.tp_size > 1:
|
|
5076
5095
|
self.is_tp_enabled = True
|
|
5077
|
-
if self.args.parallelism_config is
|
|
5078
|
-
if is_accelerate_available("1.
|
|
5079
|
-
if self.args.parallelism_config is
|
|
5096
|
+
if self.args.parallelism_config is None:
|
|
5097
|
+
if is_accelerate_available("1.12.0"):
|
|
5098
|
+
if self.args.parallelism_config is None:
|
|
5080
5099
|
from accelerate import ParallelismConfig
|
|
5081
5100
|
|
|
5082
5101
|
args["parallelism_config"] = ParallelismConfig(tp_size=self.model.tp_size)
|
|
5083
5102
|
else:
|
|
5084
|
-
raise ValueError("Requires accelerate>1.
|
|
5103
|
+
raise ValueError("Requires accelerate>1.12.0 to use Tensor Parallelism.")
|
|
5085
5104
|
|
|
5086
5105
|
if is_accelerate_available("1.2.0"):
|
|
5087
5106
|
# it we don't have the correct version, we will rely on env var instead that were set in TrainingArguments
|
transformers/trainer_callback.py
CHANGED
|
@@ -420,6 +420,11 @@ class TrainerCallback:
|
|
|
420
420
|
Event called after a prediction step.
|
|
421
421
|
"""
|
|
422
422
|
|
|
423
|
+
def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
|
|
424
|
+
"""
|
|
425
|
+
Event called before pushing the model to the hub, at the beginning of Trainer.push_to_hub and Trainer._push_from_checkpoint.
|
|
426
|
+
"""
|
|
427
|
+
|
|
423
428
|
|
|
424
429
|
class CallbackHandler(TrainerCallback):
|
|
425
430
|
"""Internal class that just calls the list of callbacks in order."""
|
|
@@ -532,6 +537,9 @@ class CallbackHandler(TrainerCallback):
|
|
|
532
537
|
def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl):
|
|
533
538
|
return self.call_event("on_prediction_step", args, state, control)
|
|
534
539
|
|
|
540
|
+
def on_push_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
|
|
541
|
+
return self.call_event("on_push_begin", args, state, control, **kwargs)
|
|
542
|
+
|
|
535
543
|
def call_event(self, event, args, state, control, **kwargs):
|
|
536
544
|
for callback in self.callbacks:
|
|
537
545
|
result = getattr(callback, event)(
|