transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/trainer_jit_checkpoint.py
ADDED
@@ -0,0 +1,126 @@
+import os
+import signal
+import threading
+from typing import Optional
+
+from .trainer_callback import TrainerCallback
+from .trainer_utils import PREFIX_CHECKPOINT_DIR
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CheckpointManager:
+    def __init__(self, trainer, kill_wait: int = 3):
+        """
+        Initialize the CheckpointManager for Just-In-Time checkpoint handling.
+
+        Args:
+            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
+            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
+        """
+        self.trainer = trainer
+        self.is_checkpoint_requested = False
+        self._original_sigterm_handler = None
+        self.kill_wait = kill_wait
+
+    def setup_signal_handler(self):
+        self._original_sigterm_handler = signal.signal(signal.SIGTERM, self._sigterm_handler)
+        logger.info("JIT checkpoint signal handler registered for SIGTERM")
+
+    def _sigterm_handler(self, signum, frame):
+        if self.is_checkpoint_requested:
+            return
+
+        logger.info(f"SIGTERM received, will request JIT checkpoint after {self.kill_wait}s")
+        threading.Timer(self.kill_wait, self._enable_checkpoint).start()
+
+    def _enable_checkpoint(self):
+        logger.info("Kill wait period elapsed, requesting checkpoint")
+        self.is_checkpoint_requested = True
+
+    def execute_jit_checkpoint(self):
+        try:
+            # Set checkpoint flag to False to avoid multiple checkpoints getting triggered by other callbacks
+            self.is_checkpoint_requested = False
+
+            logger.info("Starting JIT checkpointing...")
+            current_step = self.trainer.state.global_step
+            logger.info(f"Saving JIT checkpoint at step {current_step}")
+
+            output_dir = self.trainer._get_output_dir(trial=None)
+            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{current_step}"
+            checkpoint_path = os.path.join(output_dir, checkpoint_folder)
+
+            # Create checkpoint directory
+            os.makedirs(checkpoint_path, exist_ok=True)
+
+            # Create a sentinel file to indicate checkpointing is in progress
+            sentinel_file = os.path.join(output_dir, checkpoint_folder, "checkpoint-is-incomplete.txt")
+            with open(sentinel_file, "w") as f:
+                f.write(f"Checkpoint started at step {current_step} and in progress...")
+            logger.info(f"Created checkpoint progress sentinel marker file: {sentinel_file}")
+
+            # Invoke the trainer's checkpoint method directly
+            self.trainer._save_checkpoint(self.trainer.model, trial=None)
+
+            # Remove sentinel file upon successful checkpointing
+            if os.path.exists(sentinel_file):
+                os.remove(sentinel_file)
+                logger.info("Sentinel marker file removed")
+
+            logger.info("Immediate JIT checkpoint completed successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to save JIT checkpoint: {e}")
+            raise
+
+
+class JITCheckpointCallback(TrainerCallback):
+    """
+    Callback for Just-In-Time checkpointing on SIGTERM signals.
+
+    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
+    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
+    the Trainer's training loop to exit gracefully after saving the checkpoint.
+    """
+
+    def __init__(self):
+        self.trainer = None
+        self.jit_manager: Optional[CheckpointManager] = None
+
+    def set_trainer(self, trainer):
+        self.trainer = trainer
+        if trainer.args.enable_jit_checkpoint:
+            self.jit_manager = CheckpointManager(trainer=trainer)
+            self.jit_manager.setup_signal_handler()
+            logger.info("JIT checkpointing enabled")
+
+    def on_pre_optimizer_step(self, args, state, control, **kwargs):
+        if self.jit_manager and self.jit_manager.is_checkpoint_requested:
+            control.should_training_stop = True
+            self.jit_manager.execute_jit_checkpoint()
+
+    def on_step_begin(self, args, state, control, **kwargs):
+        if self.jit_manager and self.jit_manager.is_checkpoint_requested:
+            control.should_training_stop = True
+            self.jit_manager.execute_jit_checkpoint()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        if self.jit_manager and self.jit_manager.is_checkpoint_requested:
+            control.should_save = False
+            control.should_training_stop = True
+            self.jit_manager.execute_jit_checkpoint()
+
+    def on_epoch_end(self, args, state, control, **kwargs):
+        if self.jit_manager and self.jit_manager.is_checkpoint_requested:
+            control.should_save = False
+            control.should_training_stop = True
+            self.jit_manager.execute_jit_checkpoint()
+
+    def on_train_end(self, args, state, control, **kwargs):
+        # Restore original SIGTERM handler
+        if self.jit_manager and self.jit_manager._original_sigterm_handler is not None:
+            signal.signal(signal.SIGTERM, self.jit_manager._original_sigterm_handler)
+            logger.info("Restored original SIGTERM handler after training completion")
transformers/trainer_seq2seq.py
CHANGED

@@ -333,7 +333,11 @@ class Seq2SeqTrainer(Trainer):
         self.model.generation_config._from_model_config = False

         # Retrieves GenerationConfig from model.generation_config
+        # Update with defaults because earlier the generation config used to be init
+        # with default values. Now we init it with `None` and keep defaults for BC
         gen_config = self.model.generation_config
+        default_gen_config = gen_config._get_default_generation_params()
+        gen_config.update(**default_gen_config, defaults_only=True)
         # in case the batch is shorter than max length, the output should be padded
         if generated_tokens.shape[-1] < gen_config.max_length:
             generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length)
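
The `defaults_only=True` update above is the backward-compatibility shim: generation config fields are now initialized to `None`, and the Seq2Seq evaluation path fills in the legacy defaults without clobbering anything the user set. `_get_default_generation_params` and the `defaults_only` flag appear in this diff; the merge semantics sketched below (only fill fields that are still unset) is an assumption for illustration.

```python
def update_defaults_only(config, **defaults):
    # Sketch: only fill fields that are still unset (None); values the
    # user already configured win over the legacy defaults.
    for name, value in defaults.items():
        if getattr(config, name, None) is None:
            setattr(config, name, value)
```
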
transformers/trainer_utils.py
CHANGED

@@ -924,7 +924,7 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
     shard_files = list(set(index["weight_map"].values()))

     # If strict=True, error before loading any of the state dicts.
-    # TODO: Here, update the
+    # TODO: Here, update the weight map with the config.dynamic_weight_conversion
     loaded_keys = index["weight_map"].keys()
     model_keys = model.state_dict().keys()
     missing_keys = [key for key in model_keys if key not in loaded_keys]
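
The completed TODO says the weight map should eventually be rewritten through `config.dynamic_weight_conversion` before the key comparison. The strict check that follows these lines compares the two key sets; in essence it does something like the simplified sketch below (the exact error type and message are assumptions):

```python
loaded_keys = set(index["weight_map"].keys())
model_keys = set(model.state_dict().keys())

missing_keys = sorted(model_keys - loaded_keys)      # expected by model, absent from shards
unexpected_keys = sorted(loaded_keys - model_keys)   # present in shards, unknown to model
if strict and (missing_keys or unexpected_keys):
    raise RuntimeError(
        f"Error loading state dict: missing {missing_keys}, unexpected {unexpected_keys}"
    )
```
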
transformers/training_args.py
CHANGED

@@ -340,9 +340,17 @@ class TrainingArguments:
             `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained
             alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two
             checkpoints are saved: the last one and the best one (if they are different).
-
-
-
+        enable_jit_checkpoint (`bool`, *optional*, defaults to `False`):
+            Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. When enabled, training will
+            checkpoint upon receiving SIGTERM, allowing for graceful termination without losing
+            progress. This is particularly useful for shared clusters with preemptible workloads (e.g., Kueue).
+            **Important**: You must configure your orchestrator's graceful shutdown period to allow sufficient time
+            for checkpoint completion. For Kubernetes, set `terminationGracePeriodSeconds` in your job definition
+            (method varies by cloud-native trainer: Kubeflow, Ray, etc.). Note: the default is only 30 seconds,
+            which is typically insufficient. For Slurm, use `--signal=USR1@<seconds>` in your sbatch script to send
+            SIGTERM with adequate time before the job time limit. Calculate the required grace period as: longest
+            possible iteration time + checkpoint saving time. For example, if an iteration takes 2 minutes and
+            checkpoint saving takes 2 minutes, set at least 4 minutes (240 seconds) of grace time.
         save_on_each_node (`bool`, *optional*, defaults to `False`):
             When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on
             the main one.
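
Enabling the new flag is a one-liner on the training side; the orchestrator-side grace period described above still has to be configured separately. A minimal usage sketch (the output path and limit are placeholders):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",            # placeholder path
    enable_jit_checkpoint=True,  # save a checkpoint on SIGTERM, then stop gracefully
    save_total_limit=2,          # placeholder value
)
# On Kubernetes, pair this with terminationGracePeriodSeconds >= longest
# iteration time + checkpoint time (the 30s default is usually too short).
```
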
@@ -585,9 +593,9 @@
             instance of `Dataset`.
         report_to (`str` or `list[str]`, *optional*, defaults to `"none"`):
             The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
-            `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"
-            `"
-
+            `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"swanlab"`,
+            `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"`
+            for no integrations.
         project (`str`, *optional*, defaults to `"huggingface"`):
             The name of the project to use for logging. Currently, only used by Trackio.
         trackio_space_id (`str` or `None`, *optional*, defaults to `"trackio"`):
@@ -852,7 +860,7 @@
     warmup_ratio: float | None = field(
         default=None,
         metadata={
-            "help": "This argument is deprecated and will be removed in v5. Use `warmup_steps` instead as it also works with float values."
+            "help": "This argument is deprecated and will be removed in v5.2. Use `warmup_steps` instead as it also works with float values."
         },
     )

@@ -929,14 +937,24 @@
                 " for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be"
                 " retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,"
                 " it is possible that two checkpoints are saved: the last one and the best one (if they are different)."
-                " Default is unlimited checkpoints"
+                " Default is unlimited checkpoints."
             )
         },
     )
-
-    default=
+    enable_jit_checkpoint: bool = field(
+        default=False,
         metadata={
-            "help":
+            "help": (
+                "Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. "
+                "When enabled, training will checkpoint upon receiving SIGTERM, "
+                "allowing for graceful termination without losing progress. "
+                "This is particularly useful for shared clusters with preemptible workloads (Kueue). "
+                "IMPORTANT: You must configure your orchestrator's graceful shutdown period. "
+                "Kubernetes: set terminationGracePeriodSeconds (default 30s is insufficient!) in your job definition. "
+                "Slurm: use --signal=USR1@<seconds> in sbatch to send SIGTERM before time limit. "
+                "Calculate required grace period as: iteration time + checkpoint saving time. "
+                "Example: 2min iteration + 2min checkpoint = 240 seconds minimum."
+            )
         },
     )
     save_on_each_node: bool = field(
@@ -1504,14 +1522,6 @@
                 f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
             )

-        if not self.save_safetensors:
-            logger.info(
-                f"Found safetensors installation, but --save_safetensors={self.save_safetensors}. "
-                f"Safetensors should be a preferred weights saving format due to security and performance reasons. "
-                f"If your model cannot be saved by safetensors please feel free to open an issue at "
-                f"https://github.com/huggingface/safetensors!"
-            )
-
         if (
             self.load_best_model_at_end or self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU
         ) and self.metric_for_best_model is None:
@@ -1520,16 +1530,14 @@
             self.greater_is_better = not self.metric_for_best_model.endswith("loss")
         if is_torch_available():
             if self.bf16 or self.bf16_full_eval:
-                if
-
-
-
-                if
-                    error_message
-
-
-                    # gpu
-                    raise ValueError(error_message)
+                if (
+                    not self.use_cpu and not is_torch_bf16_gpu_available() and not is_torch_xla_available()
+                ):  # added for tpu support
+                    error_message = "Your setup doesn't support bf16/gpu. You need to assign use_cpu if you want to train the model on CPU"
+                    if is_torch_cuda_available():
+                        error_message += " You need Ampere+ GPU with cuda>=11.0"
+                    # gpu
+                    raise ValueError(error_message)

         if self.fp16 and self.bf16:
             raise ValueError("At most one of fp16 and bf16 can be True, but not both")
@@ -2359,8 +2367,8 @@
         report_to (`str` or `list[str]`, *optional*, defaults to `"none"`):
             The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
             `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`,
-            `"
-
+            `"swanlab"`, `"tensorboard"`, `"trackio"` and `"wandb"`. Use `"all"` to report to all integrations
+            installed, `"none"` for no integrations.
         first_step (`bool`, *optional*, defaults to `False`):
             Whether to log and evaluate the first `global_step` or not.
         nan_inf_filter (`bool`, *optional*, defaults to `True`):
@@ -2565,7 +2573,7 @@
         ```
         """
         if warmup_ratio is not None:
-            logger.warning("warmup_ratio is deprecated and will be removed in v5. Use `warmup_steps` instead.")
+            logger.warning("warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.")
             warmup_steps = warmup_ratio

         self.lr_scheduler_type = SchedulerType(name)
@@ -2742,10 +2750,24 @@
                 fsdp_plugin_args["transformer_cls_names_to_wrap"] = ",".join(
                     self.fsdp_config["transformer_layer_cls_to_wrap"]
                 )
-
+            fsdp_version = int(self.fsdp_config.get("version", 1))
+            fsdp_plugin_args["fsdp_version"] = fsdp_version
             prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH")
-
-
+            if fsdp_version == 2:
+                fsdp_plugin_args["reshard_after_forward"] = str_to_bool(
+                    str(self.fsdp_config.get("reshard_after_forward", "false")).lower()
+                )
+            else:
+                fsdp_plugin_args["forward_prefetch"] = str_to_bool(
+                    str(self.fsdp_config.get("forward_prefetch", "false")).lower()
+                )
+                fsdp_plugin_args["backward_prefetch"] = prefetch_policy.upper()
+                fsdp_plugin_args["reshard_after_forward"] = str(
+                    self.fsdp_config.get("reshard_after_forward", "FULL_SHARD")
+                ).lower()
+                fsdp_plugin_args["use_orig_params"] = str_to_bool(
+                    str(self.fsdp_config.get("use_orig_params", "true")).lower()
+                )

             sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower()
             cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower()
@@ -2755,11 +2777,10 @@
                 raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`')

             # we need to set the env here as otherwise we get a warning in accelerate + we need to set it for transformers
-            fsdp_plugin_args["cpu_ram_efficient_loading"] = cpu_ram_efficient_loading
+            fsdp_plugin_args["cpu_ram_efficient_loading"] = str_to_bool(cpu_ram_efficient_loading)
             os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading

-            fsdp_plugin_args["sync_module_states"] = sync_module_states
-            fsdp_plugin_args["use_orig_params"] = str(self.fsdp_config.get("use_orig_params", "true")).lower()
+            fsdp_plugin_args["sync_module_states"] = str_to_bool(sync_module_states)

         return fsdp_plugin_args

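
The FSDP plumbing now branches on the config's `version`: v2 configs get a boolean `reshard_after_forward` (parsed through the new `str_to_bool`), while v1 keeps `backward_prefetch`, the string `reshard_after_forward` strategy, and `use_orig_params`. A v2-style config that exercises the new branch might look like the sketch below (the values are illustrative, not recommendations):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",  # placeholder path
    fsdp="full_shard",
    fsdp_config={
        "version": 2,                     # routes into the fsdp_version == 2 branch
        "reshard_after_forward": "true",  # parsed with str_to_bool -> True
        "cpu_ram_efficient_loading": "true",
        "sync_module_states": "true",     # required when cpu_ram_efficient_loading is on
    },
)
```
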
@@ -2771,3 +2792,18 @@ class ParallelMode(Enum):
     SAGEMAKER_MODEL_PARALLEL = "sagemaker_model_parallel"
     SAGEMAKER_DATA_PARALLEL = "sagemaker_data_parallel"
     TPU = "tpu"
+
+
+def str_to_bool(value, to_bool: bool = True) -> int | bool:
+    """
+    Converts a string representation of truth to `True` (1) or `False` (0).
+
+    True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False values are `n`, `no`, `f`, `false`, `off`, and `0`.
+    """
+    value = value.lower()
+    if value in ("y", "yes", "t", "true", "on", "1"):
+        return 1 if not to_bool else True
+    elif value in ("n", "no", "f", "false", "off", "0"):
+        return 0 if not to_bool else False
+    else:
+        raise ValueError(f"invalid truth value {value}")
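
The new module-level `str_to_bool` mirrors the semantics of the removed `distutils.util.strtobool`, with an extra `to_bool` switch for callers that want a real boolean instead of `0`/`1`. Behavior follows directly from the code above:

```python
from transformers.training_args import str_to_bool  # module-level helper added above

assert str_to_bool("True") is True
assert str_to_bool("off") is False
assert str_to_bool("1", to_bool=False) == 1   # legacy int form
try:
    str_to_bool("maybe")
except ValueError as e:
    print(e)  # invalid truth value maybe
```
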
transformers/utils/__init__.py
CHANGED

@@ -49,6 +49,7 @@ from .generic import (
     PaddingStrategy,
     TensorType,
     TransformersKwargs,
+    _is_tensor_or_array_like,
     can_return_loss,
     can_return_tuple,
     expand_dims,
@@ -91,7 +92,6 @@ from .hub import (
     extract_commit_hash,
     has_file,
     http_user_agent,
-    is_offline_mode,
     list_repo_templates,
     try_to_load_from_cache,
 )
@@ -114,8 +114,6 @@ from .import_utils import (
     is_apex_available,
     is_apollo_torch_available,
     is_aqlm_available,
-    is_auto_awq_available,
-    is_auto_gptq_available,
     is_auto_round_available,
     is_av_available,
     is_bitsandbytes_available,
@@ -129,7 +127,8 @@ from .import_utils import (
     is_datasets_available,
     is_decord_available,
     is_detectron2_available,
-
+    is_env_variable_false,
+    is_env_variable_true,
     is_essentia_available,
     is_faiss_available,
     is_fbgemm_gpu_available,
@@ -146,6 +145,7 @@ from .import_utils import (
     is_gguf_available,
     is_gptqmodel_available,
     is_grokadamw_available,
+    is_grouped_mm_available,
     is_habana_gaudi1,
     is_hadamard_available,
     is_hqq_available,
@@ -161,6 +161,7 @@ from .import_utils import (
     is_libcst_available,
     is_librosa_available,
     is_liger_kernel_available,
+    is_llm_awq_available,
     is_lomo_available,
     is_matplotlib_available,
     is_mistral_common_available,
@@ -169,6 +170,7 @@ from .import_utils import (
     is_ninja_available,
     is_nltk_available,
     is_num2words_available,
+    is_numba_available,
     is_onnx_available,
     is_openai_available,
     is_optimum_available,
@@ -183,6 +185,7 @@ from .import_utils import (
     is_pyctcdecode_available,
     is_pytesseract_available,
     is_pytest_available,
+    is_pytest_order_available,
     is_pytorch_quantization_available,
     is_quanto_greater,
     is_quark_available,
transformers/utils/attention_visualizer.py
CHANGED

@@ -21,7 +21,7 @@ from ..models.auto.auto_factory import _get_model_class
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.modeling_auto import MODEL_FOR_PRETRAINING_MAPPING, MODEL_MAPPING
 from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES, AutoProcessor
-from ..models.auto.tokenization_auto import
+from ..models.auto.tokenization_auto import AutoTokenizer
 from .import_utils import is_torch_available


@@ -199,12 +199,12 @@ class AttentionMaskVisualizer:
             if "token_type_ids" in inputs:  # TODO inspect signature of update causal mask
                 kwargs["token_type_ids"] = inputs["token_type_ids"]
             tokens = processor.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-
+        else:
             tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
             tokens = tokenizer.tokenize(input_sentence)
             attention_mask = tokenizer(input_sentence, return_tensors="pt")["attention_mask"]
-
-
+        if attention_mask is None:
+            raise ValueError(f"Model type {self.config.model_type} does not support attention visualization")

         model.config._attn_implementation = "eager"
         model.train()
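
The fix above restores the `else:` branch for text-only models (tokenizer instead of processor) and raises a clear error when no attention mask is produced. Typical usage is unchanged; a rough sketch, assuming the class is constructed from a repo id as `self.repo_id` suggests (the checkpoint name is a placeholder):

```python
from transformers.utils.attention_visualizer import AttentionMaskVisualizer

visualizer = AttentionMaskVisualizer("openai-community/gpt2")  # placeholder checkpoint
visualizer("The quick brown fox")  # renders the attention mask layout for the sentence
```
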
transformers/utils/args_doc.py
CHANGED

@@ -17,7 +17,8 @@ import inspect
 import os
 import textwrap
 from pathlib import Path
-from
+from types import UnionType
+from typing import Union, get_args, get_origin

 import regex as re

@@ -67,6 +68,7 @@ HARDCODED_CONFIG_FOR_MODELS = {
     "donut": "DonutSwinConfig",
     "esmfold": "EsmConfig",
     "parakeet": "ParakeetCTCConfig",
+    "lasr": "LasrCTCConfig",
 }

 _re_checkpoint = re.compile(r"\[(.+?)\]\((https://huggingface\.co/.+?)\)")
@@ -1279,38 +1281,46 @@ def _get_model_info(func, parent_class):
     return model_name_lowercase, class_name, config_class


-def _process_parameter_type(param
+def _process_parameter_type(param):
     """
     Process and format a parameter's type annotation.

     Args:
         param (`inspect.Parameter`): The parameter from the function signature
-        param_name (`str`): The name of the parameter
-        func (`function`): The function the parameter belongs to
     """
     optional = False
-    if param.annotation
-
-
-
-
-
-
-
-
-
-
-
-    )
-    if "ForwardRef" in param_type:
-        param_type = re.sub(r"ForwardRef\('([\w.]+)'\)", r"\1", param_type)
-    if "Optional" in param_type:
-        param_type = re.sub(r"Optional\[(.*?)\]", r"\1", param_type)
+    if param.annotation == inspect.Parameter.empty:
+        return "", False
+    elif param.annotation is None:
+        return "None", True
+    # This is, astonishingly, the right way to do it: https://docs.python.org/3/library/typing.html#typing.Union
+    elif get_origin(param.annotation) is Union or get_origin(param.annotation) is UnionType:
+        subtypes = get_args(param.annotation)
+    else:
+        subtypes = [param.annotation]  # Just pretend it's a single-element union so we don't need two code paths
+    out_str = []
+    for subtype in subtypes:
+        if subtype is type(None):
             optional = True
+            continue
+        if hasattr(subtype, "__module__") and hasattr(subtype, "__name__"):
+            subtype = f"{subtype.__module__.replace('transformers.', '~').replace('builtins', '').replace('typing.', '')}.{subtype.__name__}".removeprefix(
+                "."
+            )
+        else:
+            subtype = str(subtype)  # Just give up
+        if "ForwardRef" in subtype:
+            subtype = re.sub(r"ForwardRef\('([\w.]+)'\)", r"\1", subtype)
+        out_str.append(subtype)
+
+    if param.default is not inspect.Parameter.empty:
+        optional = True
+    if not out_str:
+        return "", optional
+    elif len(out_str) == 1:
+        return out_str[0], optional
     else:
-
-
-        return param_type, optional
+        return f"Union[{', '.join(out_str)}]", optional


 def _get_parameter_info(param_name, documented_params, source_args_dict, param_type, optional):
@@ -1391,7 +1401,7 @@ def _process_regular_parameters(
             continue

         # Process parameter type and optional status
-        param_type, optional = _process_parameter_type(param
+        param_type, optional = _process_parameter_type(param)

         # Check for default value
         param_default = ""
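
The rewritten `_process_parameter_type` leans on `typing.get_origin`/`get_args` instead of string parsing, so `Optional[X]` and the PEP 604 form `X | None` (`types.UnionType`) go through one code path. The core trick in isolation (the `union_members` helper is hypothetical, written for illustration):

```python
import types
from typing import Optional, Union, get_args, get_origin


def union_members(annotation):
    # Both Optional[int] and int | None report a Union origin, so one code
    # path covers classic typing syntax and PEP 604 unions alike.
    if get_origin(annotation) in (Union, types.UnionType):
        return get_args(annotation)
    return (annotation,)


assert union_members(Optional[int]) == (int, type(None))
assert union_members(int | None) == (int, type(None))
assert union_members(str) == (str,)
```
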
transformers/utils/generic.py
CHANGED

@@ -21,7 +21,7 @@ import os
 import warnings
 from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Callable, Iterable, MutableMapping
-from contextlib import AbstractContextManager, ExitStack
+from contextlib import AbstractContextManager, ExitStack, nullcontext
 from dataclasses import dataclass, fields, is_dataclass
 from enum import Enum
 from functools import partial, wraps
@@ -42,6 +42,7 @@ _is_torch_available = False
 if is_torch_available():
     # required for @can_return_tuple decorator to work with torchdynamo
     import torch
+    from torch.types import _dtype

     from ..model_debugging_utils import model_addition_debugger_context

@@ -154,6 +155,48 @@ def is_torch_dtype(x):
     return isinstance(x, torch.dtype)


+def _is_tensor_or_array_like(value):
+    """
+    Check if a value is array-like (includes ragged arrays).
+    """
+    if is_numpy_array(value):
+        return True
+    if is_torch_tensor(value):
+        return True
+    if isinstance(value, (int, float, bool, np.number)):
+        return True
+
+    if isinstance(value, (list, tuple)):
+        if len(value) == 0:
+            # consider an empty list or nested list as array-like
+            return True
+        return _is_tensor_or_array_like(value[0])
+
+    return False
+
+
+def maybe_autocast(
+    device_type: str,
+    dtype: Optional["_dtype"] = None,
+    enabled: bool = True,
+    cache_enabled: Optional[bool] = None,
+):
+    """
+    Context manager that only autocasts if:
+
+    - `autocast` is already enabled in this context
+    - Or this call to `maybe_autocast` has `enabled=True`
+
+    This prevents `autocast` from being added to the graph when it is effectively a no-op,
+    which makes graph splitting in `torch.compile` more flexible, as it removes the
+    requirement that partition IDs be monotonically increasing.
+    """
+    if torch.is_autocast_enabled(device_type) or enabled:
+        return torch.autocast(device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
+    else:
+        return nullcontext()
+
+
 def _is_mlx(x):
     import mlx.core as mx

@@ -680,6 +723,8 @@ class TransformersKwargs(TypedDict, total=False):
         Maximum sequence length for query state.
     max_length_k (`int`, *optional*):
         Maximum sequence length for key state.
+    position_ids (`torch.LongTensor`, *optional*):
+        Indices of positions of each input sequence token.
     """

     num_items_in_batch: Optional["torch.Tensor"]
@@ -690,6 +735,7 @@ class TransformersKwargs(TypedDict, total=False):
     cu_seq_lens_k: Optional["torch.LongTensor"]
     max_length_q: int | None
     max_length_k: int | None
+    position_ids: Optional["torch.LongTensor"]


 def is_timm_config_dict(config_dict: dict[str, Any]) -> bool:
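
`maybe_autocast` keeps `torch.autocast` out of the traced graph when it would be a no-op: with `enabled=False` and no ambient autocast it hands back a plain `nullcontext()`. A usage sketch against the function as added above (device, shapes, and dtype are illustrative):

```python
import torch
from transformers.utils.generic import maybe_autocast

x = torch.randn(2, 4)

# enabled=False and no ambient autocast: returns nullcontext(), so no
# autocast region is traced into a torch.compile graph.
with maybe_autocast("cpu", dtype=torch.bfloat16, enabled=False):
    y = x @ x.T
    assert y.dtype == torch.float32

# enabled=True: behaves exactly like torch.autocast.
with maybe_autocast("cpu", dtype=torch.bfloat16, enabled=True):
    y = x @ x.T
    assert y.dtype == torch.bfloat16
```
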