transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/testing_utils.py
CHANGED
@@ -38,6 +38,7 @@ import types
 import unittest
 from collections import UserDict, defaultdict
 from collections.abc import Callable, Generator, Iterable, Iterator, Mapping
+from contextlib import contextmanager
 from dataclasses import MISSING, fields
 from functools import cache, wraps
 from io import StringIO
@@ -72,13 +73,13 @@ from .integrations.deepspeed import is_deepspeed_available
 from .utils import (
     ACCELERATE_MIN_VERSION,
     GGUF_MIN_VERSION,
+    SAFE_WEIGHTS_INDEX_NAME,
     TRITON_MIN_VERSION,
+    WEIGHTS_INDEX_NAME,
     is_accelerate_available,
     is_apex_available,
     is_apollo_torch_available,
     is_aqlm_available,
-    is_auto_awq_available,
-    is_auto_gptq_available,
     is_auto_round_available,
     is_av_available,
     is_bitsandbytes_available,
@@ -88,7 +89,6 @@ from .utils import (
     is_cython_available,
     is_decord_available,
     is_detectron2_available,
-    is_eetq_available,
     is_essentia_available,
     is_faiss_available,
     is_fbgemm_gpu_available,
@@ -219,14 +219,19 @@ _COMMON_MODEL_NAMES_MAP = {

 if is_torch_available():
     import torch
+    from safetensors.torch import load_file
+
+    from .modeling_utils import PreTrainedModel

     IS_ROCM_SYSTEM = torch.version.hip is not None
     IS_CUDA_SYSTEM = torch.version.cuda is not None
     IS_XPU_SYSTEM = getattr(torch.version, "xpu", None) is not None
+    IS_NPU_SYSTEM = getattr(torch, "npu", None) is not None
 else:
     IS_ROCM_SYSTEM = False
     IS_CUDA_SYSTEM = False
     IS_XPU_SYSTEM = False
+    IS_NPU_SYSTEM = False

 logger = transformers_logging.get_logger(__name__)

@@ -266,6 +271,7 @@ _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=Fa
 _run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False)
 _run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True)
 _run_agent_tests = parse_flag_from_env("RUN_AGENT_TESTS", default=False)
+_run_training_tests = parse_flag_from_env("RUN_TRAINING_TESTS", default=True)


 def is_staging_test(test_case):
@@ -316,6 +322,22 @@ def is_agent_test(test_case):
     return pytest.mark.is_agent_test()(test_case)


+def is_training_test(test_case):
+    """
+    Decorator marking a test as a training test. If RUN_TRAINING_TESTS is set to a falsy value, those tests will be
+    skipped.
+    """
+    if not _run_training_tests:
+        return unittest.skip(reason="test is training test")(test_case)
+    else:
+        try:
+            import pytest  # We don't need a hard dependency on pytest in the main library
+        except ImportError:
+            return test_case
+        else:
+            return pytest.mark.is_training_test()(test_case)
+
+
 def slow(test_case):
     """
     Decorator marking a test as slow.
@@ -637,6 +659,9 @@ def require_read_token(test_case):
                 if getattr(attr, "__require_read_token__", False):
                     continue
                 wrapped = require_read_token(attr)
+                if isinstance(inspect.getattr_static(test_case, attr_name), staticmethod):
+                    # Don't accidentally bind staticmethods to `self`
+                    wrapped = staticmethod(wrapped)
                 setattr(test_case, attr_name, wrapped)
         return test_case
     else:
@@ -649,10 +674,6 @@ def require_read_token(test_case):
                 with patch("huggingface_hub.utils._headers.get_token", return_value=token):
                     return test_case(*args, **kwargs)
             else:  # Allow running locally with the default token env variable
-                # dealing with static/class methods and called by `self.xxx`
-                if "staticmethod" in inspect.getsource(test_case).strip():
-                    if len(args) > 0 and isinstance(args[0], unittest.TestCase):
-                        return test_case(*args[1:], **kwargs)
                 return test_case(*args, **kwargs)

         wrapper.__require_read_token__ = True
@@ -1239,23 +1260,6 @@ def require_spqr(test_case):
     return unittest.skipUnless(is_spqr_available(), "test requires spqr")(test_case)


-def require_eetq(test_case):
-    """
-    Decorator marking a test that requires eetq
-    """
-    eetq_available = is_eetq_available()
-    if eetq_available:
-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Remove once eetq releases a fix and this release is used in CI
-                eetq_available = False
-    return unittest.skipUnless(eetq_available, "test requires eetq")(test_case)
-
-
 def require_av(test_case):
     """
     Decorator marking a test that requires av
@@ -1291,13 +1295,11 @@ def require_tensorboard(test_case):
     return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")


-def
+def require_gptqmodel(test_case):
     """
-    Decorator for
+    Decorator for gptqmodel dependency
     """
-    return unittest.skipUnless(
-        is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq"
-    )(test_case)
+    return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case)


 def require_hqq(test_case):
@@ -1307,13 +1309,6 @@ def require_hqq(test_case):
     return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case)


-def require_auto_awq(test_case):
-    """
-    Decorator for auto_awq dependency
-    """
-    return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case)
-
-
 def require_auto_round(test_case):
     """
     Decorator for auto_round dependency
@@ -3192,6 +3187,8 @@ def get_device_properties() -> DeviceProperties:
         gen_mask = 0x000000FF00000000
         gen = (arch & gen_mask) >> 32
         return ("xpu", gen, None)
+    elif IS_NPU_SYSTEM:
+        return ("npu", None, None)
     else:
         return (torch_device, None, None)

@@ -4092,3 +4089,267 @@ def write_file(file, content):
 def read_json_file(file):
     with open(file, "r") as fh:
         return json.load(fh)
+
+
+# =============================================================================
+# Training CI Utilities - Logging and Memory Monitoring
+# =============================================================================
+
+
+# ANSI color codes for terminal output
+class Colors:
+    """ANSI color codes for terminal output formatting."""
+
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
+
+    # Foreground colors
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
+
+    # Bright variants
+    BRIGHT_RED = "\033[91m"
+    BRIGHT_GREEN = "\033[92m"
+    BRIGHT_YELLOW = "\033[93m"
+    BRIGHT_BLUE = "\033[94m"
+    BRIGHT_CYAN = "\033[96m"
+
+
+class ColoredFormatter(logging.Formatter):
+    """Custom formatter that adds colors based on log level."""
+
+    LEVEL_COLORS = {
+        logging.DEBUG: Colors.DIM + Colors.CYAN,
+        logging.INFO: Colors.WHITE,
+        logging.WARNING: Colors.BRIGHT_YELLOW,
+        logging.ERROR: Colors.BRIGHT_RED,
+        logging.CRITICAL: Colors.BOLD + Colors.BRIGHT_RED,
+    }
+
+    # Loggers that should be dimmed (less important/verbose)
+    DIMMED_LOGGERS = {"httpx", "httpcore", "urllib3", "requests"}
+
+    def __init__(self, fmt: str | None = None, datefmt: str | None = None):
+        super().__init__(fmt, datefmt)
+
+    def format(self, record: logging.LogRecord) -> str:
+        # Check if this logger should be dimmed
+        is_dimmed = record.name in self.DIMMED_LOGGERS
+
+        if is_dimmed:
+            # Dim the entire log line for httpx and similar
+            timestamp = self.formatTime(record, self.datefmt)
+            message = record.getMessage()
+            return f"{Colors.DIM}{timestamp} - {record.name} - {record.levelname:8} - {message}{Colors.RESET}"
+
+        # Get color for this level
+        color = self.LEVEL_COLORS.get(record.levelno, Colors.RESET)
+
+        # Color the level name
+        levelname = record.levelname
+        colored_levelname = f"{color}{levelname:8}{Colors.RESET}"
+
+        # Color the timestamp
+        colored_time = f"{Colors.DIM}{self.formatTime(record, self.datefmt)}{Colors.RESET}"
+
+        # Color the logger name
+        colored_name = f"{Colors.BLUE}{record.name}{Colors.RESET}"
+
+        # Get message
+        message = record.getMessage()
+
+        return f"{colored_time} - {colored_name} - {colored_levelname} - {message}"
+
+
+_warn_once_logged: set[str] = set()
+
+
+def init_test_logger() -> logging.Logger:
+    """Initialize a test-specific logger with colored stderr handler and INFO level for tests.
+
+    Uses a named logger instead of root logger to avoid conflicts with pytest-xdist parallel execution.
+    Uses stderr instead of stdout to avoid deadlocks with pytest-xdist output capture.
+    """
+    logger = logging.getLogger("transformers.training_test")
+    logger.setLevel(logging.INFO)
+
+    # Only add handler if not already present (avoid duplicate handlers on repeated calls)
+    if not logger.handlers:
+        # Use stderr instead of stdout - pytest-xdist captures stdout which can cause deadlocks
+        ch = logging.StreamHandler(sys.stderr)
+        ch.setLevel(logging.INFO)
+
+        # Use colored formatter if terminal supports it, plain otherwise
+        if sys.stderr.isatty():
+            formatter = ColoredFormatter(datefmt="%Y-%m-%d %H:%M:%S")
+        else:
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+            )
+
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    logger.propagate = False  # Don't propagate to root logger to avoid duplicate output
+    return logger
+
+
+def warn_once(logger_instance: logging.Logger, msg: str) -> None:
+    """Log a warning message only once per unique message.
+
+    Uses a global set to track messages that have already been logged
+    to prevent duplicate warning messages from cluttering the output.
+
+    Args:
+        logger_instance: The logger instance to use for warning.
+        msg: The warning message to log.
+    """
+    if msg not in _warn_once_logged:
+        logger_instance.warning(msg)
+        _warn_once_logged.add(msg)
+
+
+# Named tuple for passing memory stats for logging
+MemoryStats = collections.namedtuple(
+    "MemoryStats",
+    [
+        "rss_gib",  # Resident Set Size in GiB
+        "rss_pct",  # RSS as percentage of total memory
+        "vms_gib",  # Virtual Memory Size in GiB
+        "peak_rss_gib",  # Peak RSS in GiB
+        "peak_rss_pct",  # Peak RSS as percentage of total memory
+        "available_gib",  # Available system memory in GiB
+        "total_gib",  # Total system memory in GiB
+    ],
+)
+
+
+class CPUMemoryMonitor:
+    """Monitor CPU memory usage for the current process."""
+
+    def __init__(self):
+        self.device_name = "CPU"
+        self._peak_rss = 0
+        self._process = None
+        self.total_memory = 0
+        self.total_memory_gib = 0
+
+        if is_psutil_available():
+            import psutil
+
+            self._process = psutil.Process(os.getpid())
+            mem_info = psutil.virtual_memory()
+            self.total_memory = mem_info.total
+            self.total_memory_gib = self._to_gib(self.total_memory)
+
+    def _to_gib(self, memory_in_bytes: int) -> float:
+        """Convert bytes to GiB."""
+        return memory_in_bytes / (1024 * 1024 * 1024)
+
+    def _to_pct(self, memory_in_bytes: int) -> float:
+        """Convert bytes to percentage of total memory."""
+        if self.total_memory == 0:
+            return 0.0
+        return 100.0 * memory_in_bytes / self.total_memory
+
+    def _update_peak(self) -> None:
+        """Update peak memory tracking."""
+        if self._process is not None:
+            current_rss = self._process.memory_info().rss
+            self._peak_rss = max(self._peak_rss, current_rss)
+
+    def get_stats(self) -> MemoryStats:
+        """Get current memory statistics."""
+        if not is_psutil_available():
+            return MemoryStats(0, 0, 0, 0, 0, 0, 0)
+
+        import psutil
+
+        self._update_peak()
+
+        mem_info = self._process.memory_info()
+        sys_mem = psutil.virtual_memory()
+
+        return MemoryStats(
+            rss_gib=self._to_gib(mem_info.rss),
+            rss_pct=self._to_pct(mem_info.rss),
+            vms_gib=self._to_gib(mem_info.vms),
+            peak_rss_gib=self._to_gib(self._peak_rss),
+            peak_rss_pct=self._to_pct(self._peak_rss),
+            available_gib=self._to_gib(sys_mem.available),
+            total_gib=self._to_gib(sys_mem.total),
+        )
+
+    def reset_peak_stats(self) -> None:
+        """Reset peak memory tracking."""
+        if self._process is not None:
+            self._peak_rss = self._process.memory_info().rss
+
+
+def build_cpu_memory_monitor(logger_instance: logging.Logger | None = None) -> CPUMemoryMonitor:
+    """Build and initialize a CPU memory monitor.
+
+    Args:
+        logger_instance: Optional logger to log initialization info. If None, no logging is done.
+
+    Returns:
+        CPUMemoryMonitor instance.
+    """
+    monitor = CPUMemoryMonitor()
+    if logger_instance is not None:
+        if is_psutil_available():
+            logger_instance.info(f"CPU memory monitor initialized: {monitor.total_memory_gib:.2f} GiB total")
+        else:
+            logger_instance.warning("psutil not available, memory monitoring disabled")
+    return monitor
+
+
+def convert_all_safetensors_to_bins(folder: str):
+    """Convert all safetensors files into torch bin files, to mimic saving with torch (since we still support loading
+    bin files, but not saving them anymore)"""
+    for file in os.listdir(folder):
+        path = os.path.join(folder, file)
+        if file.endswith(".safetensors"):
+            new_path = path.replace(".safetensors", ".bin").replace("model", "pytorch_model")
+            state_dict = load_file(path)
+            os.remove(path)
+            torch.save(state_dict, new_path)
+        # Adapt the index as well
+        elif file == SAFE_WEIGHTS_INDEX_NAME:
+            new_path = os.path.join(folder, WEIGHTS_INDEX_NAME)
+            with open(path) as f:
+                index = json.loads(f.read())
+            os.remove(path)
+            if "weight_map" in index.keys():
+                weight_map = index["weight_map"]
+                new_weight_map = {}
+                for k, v in weight_map.items():
+                    new_weight_map[k] = v.replace(".safetensors", ".bin").replace("model", "pytorch_model")
+                index["weight_map"] = new_weight_map
+            with open(new_path, "w") as f:
+                f.write(json.dumps(index, indent=4))
+
+
+@contextmanager
+def force_serialization_as_bin_files():
+    """Since we don't support saving with torch `.bin` files anymore, but still support loading them, we use this context
+    to easily create the bin files and try to load them back"""
+    try:
+        # Monkey patch the method to save as bin files
+        original_save = PreTrainedModel.save_pretrained
+
+        def new_save(self, save_directory, *args, **kwargs):
+            original_save(self, save_directory, *args, **kwargs)
+            convert_all_safetensors_to_bins(save_directory)
+
+        PreTrainedModel.save_pretrained = new_save
+
+        yield
+    finally:
+        PreTrainedModel.save_pretrained = original_save
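Taken together, the testing_utils.py changes add a RUN_TRAINING_TESTS gate (`is_training_test`), NPU detection, colored test logging, CPU memory monitoring, and helpers for exercising legacy `.bin` checkpoint loading now that saving them is no longer supported. Below is a minimal sketch of how a test might combine the new helpers; the tiny model repo, the test name, and the assertion are illustrative and not part of the diff:

from transformers import AutoModelForCausalLM
from transformers.testing_utils import (
    build_cpu_memory_monitor,
    force_serialization_as_bin_files,
    init_test_logger,
    is_training_test,
)

logger = init_test_logger()


@is_training_test
def test_legacy_bin_round_trip(tmp_path):
    # Skipped when the RUN_TRAINING_TESTS environment flag is set to a falsy value.
    monitor = build_cpu_memory_monitor(logger)  # logs total RAM when psutil is installed

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

    # Inside this context, save_pretrained is patched so the saved safetensors
    # checkpoint (and its index) is converted into pytorch_model*.bin files,
    # letting the test exercise the legacy loading path.
    with force_serialization_as_bin_files():
        model.save_pretrained(tmp_path)
        reloaded = AutoModelForCausalLM.from_pretrained(tmp_path)

    assert reloaded is not None
    stats = monitor.get_stats()
    logger.info(f"peak RSS during round-trip: {stats.peak_rss_gib:.2f} GiB")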
transformers/tokenization_mistral_common.py
CHANGED

@@ -1986,3 +1986,7 @@ class MistralCommonBackend(PushToHubMixin):
         if mode not in [ValidationMode.finetuning, ValidationMode.test]:
             raise ValueError(_invalid_mode_msg)
         return mode
+
+
+# Backward compatibility alias for codebases still importing the legacy name.
+MistralCommonTokenizer = MistralCommonBackend