transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/testing_utils.py
CHANGED
|
@@ -38,6 +38,7 @@ import types
|
|
|
38
38
|
import unittest
|
|
39
39
|
from collections import UserDict, defaultdict
|
|
40
40
|
from collections.abc import Callable, Generator, Iterable, Iterator, Mapping
|
|
41
|
+
from contextlib import contextmanager
|
|
41
42
|
from dataclasses import MISSING, fields
|
|
42
43
|
from functools import cache, wraps
|
|
43
44
|
from io import StringIO
|
|
@@ -72,13 +73,13 @@ from .integrations.deepspeed import is_deepspeed_available
|
|
|
72
73
|
from .utils import (
|
|
73
74
|
ACCELERATE_MIN_VERSION,
|
|
74
75
|
GGUF_MIN_VERSION,
|
|
76
|
+
SAFE_WEIGHTS_INDEX_NAME,
|
|
75
77
|
TRITON_MIN_VERSION,
|
|
78
|
+
WEIGHTS_INDEX_NAME,
|
|
76
79
|
is_accelerate_available,
|
|
77
80
|
is_apex_available,
|
|
78
81
|
is_apollo_torch_available,
|
|
79
82
|
is_aqlm_available,
|
|
80
|
-
is_auto_awq_available,
|
|
81
|
-
is_auto_gptq_available,
|
|
82
83
|
is_auto_round_available,
|
|
83
84
|
is_av_available,
|
|
84
85
|
is_bitsandbytes_available,
|
|
@@ -88,7 +89,6 @@ from .utils import (
|
|
|
88
89
|
is_cython_available,
|
|
89
90
|
is_decord_available,
|
|
90
91
|
is_detectron2_available,
|
|
91
|
-
is_eetq_available,
|
|
92
92
|
is_essentia_available,
|
|
93
93
|
is_faiss_available,
|
|
94
94
|
is_fbgemm_gpu_available,
|
|
@@ -118,6 +118,7 @@ from .utils import (
|
|
|
118
118
|
is_mistral_common_available,
|
|
119
119
|
is_natten_available,
|
|
120
120
|
is_nltk_available,
|
|
121
|
+
is_numba_available,
|
|
121
122
|
is_onnx_available,
|
|
122
123
|
is_openai_available,
|
|
123
124
|
is_optimum_available,
|
|
@@ -130,6 +131,7 @@ from .utils import (
|
|
|
130
131
|
is_pyctcdecode_available,
|
|
131
132
|
is_pytesseract_available,
|
|
132
133
|
is_pytest_available,
|
|
134
|
+
is_pytest_order_available,
|
|
133
135
|
is_pytorch_quantization_available,
|
|
134
136
|
is_quark_available,
|
|
135
137
|
is_qutlass_available,
|
|
@@ -219,14 +221,19 @@ _COMMON_MODEL_NAMES_MAP = {
|
|
|
219
221
|
|
|
220
222
|
if is_torch_available():
|
|
221
223
|
import torch
|
|
224
|
+
from safetensors.torch import load_file
|
|
225
|
+
|
|
226
|
+
from .modeling_utils import FLASH_ATTN_KERNEL_FALLBACK, PreTrainedModel
|
|
222
227
|
|
|
223
228
|
IS_ROCM_SYSTEM = torch.version.hip is not None
|
|
224
229
|
IS_CUDA_SYSTEM = torch.version.cuda is not None
|
|
225
230
|
IS_XPU_SYSTEM = getattr(torch.version, "xpu", None) is not None
|
|
231
|
+
IS_NPU_SYSTEM = getattr(torch, "npu", None) is not None
|
|
226
232
|
else:
|
|
227
233
|
IS_ROCM_SYSTEM = False
|
|
228
234
|
IS_CUDA_SYSTEM = False
|
|
229
235
|
IS_XPU_SYSTEM = False
|
|
236
|
+
IS_NPU_SYSTEM = False
|
|
230
237
|
|
|
231
238
|
logger = transformers_logging.get_logger(__name__)
|
|
232
239
|
|
|
@@ -266,6 +273,7 @@ _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=Fa
|
|
|
266
273
|
_run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False)
|
|
267
274
|
_run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True)
|
|
268
275
|
_run_agent_tests = parse_flag_from_env("RUN_AGENT_TESTS", default=False)
|
|
276
|
+
_run_training_tests = parse_flag_from_env("RUN_TRAINING_TESTS", default=True)
|
|
269
277
|
|
|
270
278
|
|
|
271
279
|
def is_staging_test(test_case):
|
|
@@ -316,6 +324,22 @@ def is_agent_test(test_case):
|
|
|
316
324
|
return pytest.mark.is_agent_test()(test_case)
|
|
317
325
|
|
|
318
326
|
|
|
327
|
+
def is_training_test(test_case):
|
|
328
|
+
"""
|
|
329
|
+
Decorator marking a test as a training test. If RUN_TRAINING_TESTS is set to a falsy value, those tests will be
|
|
330
|
+
skipped.
|
|
331
|
+
"""
|
|
332
|
+
if not _run_training_tests:
|
|
333
|
+
return unittest.skip(reason="test is training test")(test_case)
|
|
334
|
+
else:
|
|
335
|
+
try:
|
|
336
|
+
import pytest # We don't need a hard dependency on pytest in the main library
|
|
337
|
+
except ImportError:
|
|
338
|
+
return test_case
|
|
339
|
+
else:
|
|
340
|
+
return pytest.mark.is_training_test()(test_case)
|
|
341
|
+
|
|
342
|
+
|
|
319
343
|
def slow(test_case):
|
|
320
344
|
"""
|
|
321
345
|
Decorator marking a test as slow.
|
|
@@ -598,7 +622,7 @@ def require_flash_attn(test_case):
|
|
|
598
622
|
try:
|
|
599
623
|
from kernels import get_kernel
|
|
600
624
|
|
|
601
|
-
get_kernel("
|
|
625
|
+
get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
|
|
602
626
|
except Exception as _:
|
|
603
627
|
kernels_available = False
|
|
604
628
|
|
|
@@ -637,6 +661,9 @@ def require_read_token(test_case):
|
|
|
637
661
|
if getattr(attr, "__require_read_token__", False):
|
|
638
662
|
continue
|
|
639
663
|
wrapped = require_read_token(attr)
|
|
664
|
+
if isinstance(inspect.getattr_static(test_case, attr_name), staticmethod):
|
|
665
|
+
# Don't accidentally bind staticmethods to `self`
|
|
666
|
+
wrapped = staticmethod(wrapped)
|
|
640
667
|
setattr(test_case, attr_name, wrapped)
|
|
641
668
|
return test_case
|
|
642
669
|
else:
|
|
@@ -649,10 +676,6 @@ def require_read_token(test_case):
|
|
|
649
676
|
with patch("huggingface_hub.utils._headers.get_token", return_value=token):
|
|
650
677
|
return test_case(*args, **kwargs)
|
|
651
678
|
else: # Allow running locally with the default token env variable
|
|
652
|
-
# dealing with static/class methods and called by `self.xxx`
|
|
653
|
-
if "staticmethod" in inspect.getsource(test_case).strip():
|
|
654
|
-
if len(args) > 0 and isinstance(args[0], unittest.TestCase):
|
|
655
|
-
return test_case(*args[1:], **kwargs)
|
|
656
679
|
return test_case(*args, **kwargs)
|
|
657
680
|
|
|
658
681
|
wrapper.__require_read_token__ = True
|
|
@@ -1070,17 +1093,20 @@ def require_torch_large_gpu(test_case, memory: float = 20):
     )(test_case)


-def require_torch_large_accelerator(test_case, memory: float = 20):
+def require_torch_large_accelerator(test_case=None, *, memory: float = 20):
     """Decorator marking a test that requires an accelerator with more than `memory` GiB of memory."""
-    if torch_device != "cuda" and torch_device != "xpu":
-        return unittest.skip(reason=f"test requires a GPU or XPU with more than {memory} GiB of memory")(test_case)

-
+    def memory_decorator(tc):
+        if torch_device not in ("cuda", "xpu"):
+            return unittest.skip(f"test requires a GPU or XPU with more than {memory} GiB of memory")(tc)

-
-
-
-
+        torch_accel = getattr(torch, torch_device)
+        return unittest.skipUnless(
+            torch_accel.get_device_properties(0).total_memory / 1024**3 > memory,
+            f"test requires a GPU or XPU with more than {memory} GiB of memory",
+        )(tc)
+
+    return memory_decorator if test_case is None else memory_decorator(test_case)


 def require_torch_accelerator(test_case):
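With `test_case=None` and a keyword-only `memory`, the decorator now works both bare and parameterized. A usage sketch (whether the test is actually skipped still depends on the detected `torch_device` and the device's memory):

```python
from transformers.testing_utils import require_torch_large_accelerator


@require_torch_large_accelerator             # bare form, keeps the 20 GiB default
def test_medium_model():
    ...


@require_torch_large_accelerator(memory=40)  # parameterized form returns memory_decorator
def test_very_large_model():
    ...
```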
@@ -1239,23 +1265,6 @@ def require_spqr(test_case):
     return unittest.skipUnless(is_spqr_available(), "test requires spqr")(test_case)


-def require_eetq(test_case):
-    """
-    Decorator marking a test that requires eetq
-    """
-    eetq_available = is_eetq_available()
-    if eetq_available:
-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Remove once eetq releases a fix and this release is used in CI
-                eetq_available = False
-    return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
-
-
 def require_av(test_case):
     """
     Decorator marking a test that requires av
@@ -1291,13 +1300,11 @@ def require_tensorboard(test_case):
     return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")


-def
+def require_gptqmodel(test_case):
     """
-    Decorator for
+    Decorator for gptqmodel dependency
     """
-    return unittest.skipUnless(
-        is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq"
-    )(test_case)
+    return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case)


 def require_hqq(test_case):
@@ -1307,13 +1314,6 @@ def require_hqq(test_case):
     return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case)


-def require_auto_awq(test_case):
-    """
-    Decorator for auto_awq dependency
-    """
-    return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case)
-
-
 def require_auto_round(test_case):
     """
     Decorator for auto_round dependency
@@ -1386,6 +1386,13 @@ def require_pyctcdecode(test_case):
     return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case)


+def require_numba(test_case):
+    """
+    Decorator marking a test that requires numba
+    """
+    return unittest.skipUnless(is_numba_available(), "test requires numba")(test_case)
+
+
 def require_librosa(test_case):
     """
     Decorator marking a test that requires librosa
@@ -2664,9 +2671,13 @@ def run_first(test_case):
     single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
     allocation conflicts.
     """
-    import pytest
+    # Without this check, we get unwanted warnings when it's not installed
+    if is_pytest_order_available():
+        import pytest

-    return pytest.mark.order(1)(test_case)
+        return pytest.mark.order(1)(test_case)
+    else:
+        return test_case


 def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
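`run_first` now degrades gracefully when the `pytest-order` plugin is missing. A usage sketch: with the plugin installed the decorated test is scheduled first, without it the decorator is a no-op and the test keeps its normal position.

```python
from transformers.testing_utils import run_first


@run_first
def test_something_in_subprocess():
    # tests that spawn subprocesses run first to avoid device allocation conflicts
    ...
```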
@@ -3192,6 +3203,8 @@ def get_device_properties() -> DeviceProperties:
         gen_mask = 0x000000FF00000000
         gen = (arch & gen_mask) >> 32
         return ("xpu", gen, None)
+    elif IS_NPU_SYSTEM:
+        return ("npu", None, None)
     else:
         return (torch_device, None, None)

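A small sketch of consuming the `(device_type, major, minor)`-style tuple, which now includes an `"npu"` branch; as the added lines show, the NPU case reports no generation or capability information:

```python
from transformers.testing_utils import get_device_properties

device_type, major, minor = get_device_properties()
if device_type == "npu":
    # Ascend NPU: no compute-capability / generation details are reported
    print("running on NPU")
else:
    print(f"running on {device_type} (major={major}, minor={minor})")
```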
@@ -4092,3 +4105,267 @@ def write_file(file, content):
 def read_json_file(file):
     with open(file, "r") as fh:
         return json.load(fh)
+
+
+# =============================================================================
+# Training CI Utilities - Logging and Memory Monitoring
+# =============================================================================
+
+
+# ANSI color codes for terminal output
+class Colors:
+    """ANSI color codes for terminal output formatting."""
+
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
+
+    # Foreground colors
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
+
+    # Bright variants
+    BRIGHT_RED = "\033[91m"
+    BRIGHT_GREEN = "\033[92m"
+    BRIGHT_YELLOW = "\033[93m"
+    BRIGHT_BLUE = "\033[94m"
+    BRIGHT_CYAN = "\033[96m"
+
+
+class ColoredFormatter(logging.Formatter):
+    """Custom formatter that adds colors based on log level."""
+
+    LEVEL_COLORS = {
+        logging.DEBUG: Colors.DIM + Colors.CYAN,
+        logging.INFO: Colors.WHITE,
+        logging.WARNING: Colors.BRIGHT_YELLOW,
+        logging.ERROR: Colors.BRIGHT_RED,
+        logging.CRITICAL: Colors.BOLD + Colors.BRIGHT_RED,
+    }
+
+    # Loggers that should be dimmed (less important/verbose)
+    DIMMED_LOGGERS = {"httpx", "httpcore", "urllib3", "requests"}
+
+    def __init__(self, fmt: str | None = None, datefmt: str | None = None):
+        super().__init__(fmt, datefmt)
+
+    def format(self, record: logging.LogRecord) -> str:
+        # Check if this logger should be dimmed
+        is_dimmed = record.name in self.DIMMED_LOGGERS
+
+        if is_dimmed:
+            # Dim the entire log line for httpx and similar
+            timestamp = self.formatTime(record, self.datefmt)
+            message = record.getMessage()
+            return f"{Colors.DIM}{timestamp} - {record.name} - {record.levelname:8} - {message}{Colors.RESET}"
+
+        # Get color for this level
+        color = self.LEVEL_COLORS.get(record.levelno, Colors.RESET)
+
+        # Color the level name
+        levelname = record.levelname
+        colored_levelname = f"{color}{levelname:8}{Colors.RESET}"
+
+        # Color the timestamp
+        colored_time = f"{Colors.DIM}{self.formatTime(record, self.datefmt)}{Colors.RESET}"
+
+        # Color the logger name
+        colored_name = f"{Colors.BLUE}{record.name}{Colors.RESET}"
+
+        # Get message
+        message = record.getMessage()
+
+        return f"{colored_time} - {colored_name} - {colored_levelname} - {message}"
+
+
+_warn_once_logged: set[str] = set()
+
+
+def init_test_logger() -> logging.Logger:
+    """Initialize a test-specific logger with colored stderr handler and INFO level for tests.
+
+    Uses a named logger instead of root logger to avoid conflicts with pytest-xdist parallel execution.
+    Uses stderr instead of stdout to avoid deadlocks with pytest-xdist output capture.
+    """
+    logger = logging.getLogger("transformers.training_test")
+    logger.setLevel(logging.INFO)
+
+    # Only add handler if not already present (avoid duplicate handlers on repeated calls)
+    if not logger.handlers:
+        # Use stderr instead of stdout - pytest-xdist captures stdout which can cause deadlocks
+        ch = logging.StreamHandler(sys.stderr)
+        ch.setLevel(logging.INFO)
+
+        # Use colored formatter if terminal supports it, plain otherwise
+        if sys.stderr.isatty():
+            formatter = ColoredFormatter(datefmt="%Y-%m-%d %H:%M:%S")
+        else:
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+            )
+
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    logger.propagate = False  # Don't propagate to root logger to avoid duplicate output
+    return logger
+
+
+def warn_once(logger_instance: logging.Logger, msg: str) -> None:
+    """Log a warning message only once per unique message.
+
+    Uses a global set to track messages that have already been logged
+    to prevent duplicate warning messages from cluttering the output.
+
+    Args:
+        logger_instance: The logger instance to use for warning.
+        msg: The warning message to log.
+    """
+    if msg not in _warn_once_logged:
+        logger_instance.warning(msg)
+        _warn_once_logged.add(msg)
+
+
+# Named tuple for passing memory stats for logging
+MemoryStats = collections.namedtuple(
+    "MemoryStats",
+    [
+        "rss_gib",  # Resident Set Size in GiB
+        "rss_pct",  # RSS as percentage of total memory
+        "vms_gib",  # Virtual Memory Size in GiB
+        "peak_rss_gib",  # Peak RSS in GiB
+        "peak_rss_pct",  # Peak RSS as percentage of total memory
+        "available_gib",  # Available system memory in GiB
+        "total_gib",  # Total system memory in GiB
+    ],
+)
+
+
+class CPUMemoryMonitor:
+    """Monitor CPU memory usage for the current process."""
+
+    def __init__(self):
+        self.device_name = "CPU"
+        self._peak_rss = 0
+        self._process = None
+        self.total_memory = 0
+        self.total_memory_gib = 0
+
+        if is_psutil_available():
+            import psutil
+
+            self._process = psutil.Process(os.getpid())
+            mem_info = psutil.virtual_memory()
+            self.total_memory = mem_info.total
+            self.total_memory_gib = self._to_gib(self.total_memory)
+
+    def _to_gib(self, memory_in_bytes: int) -> float:
+        """Convert bytes to GiB."""
+        return memory_in_bytes / (1024 * 1024 * 1024)
+
+    def _to_pct(self, memory_in_bytes: int) -> float:
+        """Convert bytes to percentage of total memory."""
+        if self.total_memory == 0:
+            return 0.0
+        return 100.0 * memory_in_bytes / self.total_memory
+
+    def _update_peak(self) -> None:
+        """Update peak memory tracking."""
+        if self._process is not None:
+            current_rss = self._process.memory_info().rss
+            self._peak_rss = max(self._peak_rss, current_rss)
+
+    def get_stats(self) -> MemoryStats:
+        """Get current memory statistics."""
+        if not is_psutil_available():
+            return MemoryStats(0, 0, 0, 0, 0, 0, 0)
+
+        import psutil
+
+        self._update_peak()
+
+        mem_info = self._process.memory_info()
+        sys_mem = psutil.virtual_memory()
+
+        return MemoryStats(
+            rss_gib=self._to_gib(mem_info.rss),
+            rss_pct=self._to_pct(mem_info.rss),
+            vms_gib=self._to_gib(mem_info.vms),
+            peak_rss_gib=self._to_gib(self._peak_rss),
+            peak_rss_pct=self._to_pct(self._peak_rss),
+            available_gib=self._to_gib(sys_mem.available),
+            total_gib=self._to_gib(sys_mem.total),
+        )
+
+    def reset_peak_stats(self) -> None:
+        """Reset peak memory tracking."""
+        if self._process is not None:
+            self._peak_rss = self._process.memory_info().rss
+
+
+def build_cpu_memory_monitor(logger_instance: logging.Logger | None = None) -> CPUMemoryMonitor:
+    """Build and initialize a CPU memory monitor.
+
+    Args:
+        logger_instance: Optional logger to log initialization info. If None, no logging is done.
+
+    Returns:
+        CPUMemoryMonitor instance.
+    """
+    monitor = CPUMemoryMonitor()
+    if logger_instance is not None:
+        if is_psutil_available():
+            logger_instance.info(f"CPU memory monitor initialized: {monitor.total_memory_gib:.2f} GiB total")
+        else:
+            logger_instance.warning("psutil not available, memory monitoring disabled")
+    return monitor
+
+
+def convert_all_safetensors_to_bins(folder: str):
+    """Convert all safetensors files into torch bin files, to mimic saving with torch (since we still support loading
+    bin files, but not saving them anymore)"""
+    for file in os.listdir(folder):
+        path = os.path.join(folder, file)
+        if file.endswith(".safetensors"):
+            new_path = path.replace(".safetensors", ".bin").replace("model", "pytorch_model")
+            state_dict = load_file(path)
+            os.remove(path)
+            torch.save(state_dict, new_path)
+        # Adapt the index as well
+        elif file == SAFE_WEIGHTS_INDEX_NAME:
+            new_path = os.path.join(folder, WEIGHTS_INDEX_NAME)
+            with open(path) as f:
+                index = json.loads(f.read())
+            os.remove(path)
+            if "weight_map" in index.keys():
+                weight_map = index["weight_map"]
+                new_weight_map = {}
+                for k, v in weight_map.items():
+                    new_weight_map[k] = v.replace(".safetensors", ".bin").replace("model", "pytorch_model")
+                index["weight_map"] = new_weight_map
+            with open(new_path, "w") as f:
+                f.write(json.dumps(index, indent=4))
+
+
+@contextmanager
+def force_serialization_as_bin_files():
+    """Since we don't support saving with torch `.bin` files anymore, but still support loading them, we use this context
+    to easily create the bin files and try to load them back"""
+    try:
+        # Monkey patch the method to save as bin files
+        original_save = PreTrainedModel.save_pretrained
+
+        def new_save(self, save_directory, *args, **kwargs):
+            original_save(self, save_directory, *args, **kwargs)
+            convert_all_safetensors_to_bins(save_directory)
+
+        PreTrainedModel.save_pretrained = new_save
+
+        yield
+    finally:
+        PreTrainedModel.save_pretrained = original_save
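The block above adds a small logging and memory-monitoring toolkit for training CI. A combined usage sketch, assuming these helpers are exposed from `transformers.testing_utils` as the hunk suggests:

```python
from transformers.testing_utils import (
    build_cpu_memory_monitor,
    init_test_logger,
    warn_once,
)

logger = init_test_logger()                 # named logger, colored when stderr is a TTY
monitor = build_cpu_memory_monitor(logger)  # logs total memory if psutil is installed

# ... run the training step under test here ...

stats = monitor.get_stats()                 # MemoryStats namedtuple (all zeros without psutil)
logger.info(f"peak RSS: {stats.peak_rss_gib:.2f} GiB ({stats.peak_rss_pct:.1f}% of total)")
if stats.available_gib < 2.0:
    warn_once(logger, "less than 2 GiB of system memory available after the step")
monitor.reset_peak_stats()
```

`force_serialization_as_bin_files` follows the same pattern: wrapping a `save_pretrained` / `from_pretrained` round trip in that context manager converts the saved safetensors files to legacy `.bin` files so the loading path for them stays covered by tests.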
@@ -1114,7 +1114,7 @@ class MistralCommonBackend(PushToHubMixin):
             max_length = self.model_max_length

         # Test if we have a padding token
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
             raise ValueError(
                 "Asking to pad but the tokenizer does not have a padding token. "
                 "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
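The completed check only raises when no usable `pad_token_id` exists. The fix the error message recommends looks like this in practice (a generic sketch using `gpt2`, which ships without a pad token; not specific to the Mistral backend):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # reuse eos as pad, as the error message suggests
batch = tokenizer(["short", "a somewhat longer example"], padding=True)
print(batch["input_ids"])
```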
@@ -1851,8 +1851,9 @@ class MistralCommonBackend(PushToHubMixin):
             raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

         # Handle kwargs and AutoTokenizer/AutoProcessor case
+        # These kwargs are passed by AutoTokenizer/AutoProcessor but are not used by MistralCommonBackend
         if kwargs and not set(kwargs.keys()).issubset(
-            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto", "subfolder"}
         ):
             raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")

@@ -1986,3 +1987,7 @@ class MistralCommonBackend(PushToHubMixin):
         if mode not in [ValidationMode.finetuning, ValidationMode.test]:
             raise ValueError(_invalid_mode_msg)
         return mode
+
+
+# Backward compatibility alias for codebases still importing the legacy name.
+MistralCommonTokenizer = MistralCommonBackend