transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/__init__.py
CHANGED
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).
 
-__version__ = "5.0.0rc1"
+__version__ = "5.0.0rc2"
 
 import importlib
 import sys
@@ -36,6 +36,7 @@ from .utils import (
     is_librosa_available,
     is_mistral_common_available,
     is_mlx_available,
+    is_numba_available,
     is_pretty_midi_available,
 )
 
@@ -266,6 +267,7 @@ _import_structure = {
     ],
     "video_utils": [],
     "utils.kernel_config": ["KernelConfig"],
+    "utils.import_utils": ["requires_backends"],
 }
 
 # tokenizers-backed objects
@@ -439,6 +441,15 @@ else:
         "convert_and_export_with_cache",
     ]
 
+    _import_structure["core_model_loading"] = [
+        "Chunk",
+        "Concatenate",
+        "ConversionOps",
+        "MergeModulelist",
+        "PermuteForRope",
+        "SplitModulelist",
+        "WeightConverter",
+    ]
     _import_structure["modeling_flash_attention_utils"] = []
     _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
     _import_structure["modeling_outputs"] = []
@@ -492,6 +503,13 @@ if TYPE_CHECKING:
     from .configuration_utils import PretrainedConfig as PretrainedConfig
     from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS as SLOW_TO_FAST_CONVERTERS
     from .convert_slow_tokenizer import convert_slow_tokenizer as convert_slow_tokenizer
+    from .core_model_loading import Chunk as Chunk
+    from .core_model_loading import Concatenate as Concatenate
+    from .core_model_loading import ConversionOps as ConversionOps
+    from .core_model_loading import MergeModulelist as MergeModulelist
+    from .core_model_loading import PermuteForRope as PermuteForRope
+    from .core_model_loading import SplitModulelist as SplitModulelist
+    from .core_model_loading import WeightConverter as WeightConverter
 
     # Data
     from .data import DataProcessor as DataProcessor
@@ -750,6 +768,7 @@ if TYPE_CHECKING:
     from .utils import is_torch_npu_available as is_torch_npu_available
     from .utils import is_torch_xla_available as is_torch_xla_available
     from .utils import is_torch_xpu_available as is_torch_xpu_available
+    from .utils.import_utils import requires_backends
    from .utils.kernel_config import KernelConfig as KernelConfig
 
     # Quantization config
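The net effect of these `__init__.py` hunks is that the weight-conversion primitives from `core_model_loading` and `requires_backends` become importable from the package root. A quick sanity check against an installed rc2 wheel (a sketch; the names are taken verbatim from the diff above, nothing else is assumed about their APIs):

```python
# All of these are new top-level exports in 5.0.0rc2
from transformers import (
    Chunk,
    Concatenate,
    ConversionOps,
    MergeModulelist,
    PermuteForRope,
    SplitModulelist,
    WeightConverter,
    requires_backends,
)
```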
transformers/activations.py
CHANGED
@@ -205,7 +205,7 @@ class LaplaceActivation(nn.Module):
 
 class ReLUSquaredActivation(nn.Module):
     """
-    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.
+    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
     """
 
     def forward(self, input):
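For context, the corrected link points to the Primer paper (2109.08668), where relu^2 was proposed; the activation itself is just ReLU followed by squaring. A standalone equivalent of `ReLUSquaredActivation.forward` (a sketch, not the library code):

```python
import torch

def relu_squared(x: torch.Tensor) -> torch.Tensor:
    # relu(x)^2, as in "Primer: Searching for Efficient Transformers" (2109.08668)
    return torch.square(torch.nn.functional.relu(x))
```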
transformers/audio_utils.py
CHANGED
@@ -166,7 +166,6 @@ def load_audio_as(
         - `dict`: Dictionary with 'data' (base64 encoded audio data) and 'format' keys (if return_format="dict")
         - `io.BytesIO`: BytesIO object containing audio data (if return_format="buffer")
     """
-    # TODO: @eustlb, we actually don't need librosa but soxr is installed with librosa
     requires_backends(load_audio_as, ["librosa"])
 
     if return_format not in ["base64", "dict", "buffer"]:
transformers/cache_utils.py
CHANGED
@@ -37,7 +37,7 @@ class CacheLayerMixin(ABC):
         return f"{self.__class__.__name__}"
 
     @abstractmethod
-    def lazy_initialization(self, key_states: torch.Tensor): ...
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None: ...
 
     @abstractmethod
     def update(
@@ -89,7 +89,7 @@ class DynamicLayer(CacheLayerMixin):
 
     is_sliding = False
 
-    def lazy_initialization(self, key_states: torch.Tensor):
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
         self.dtype, self.device = key_states.dtype, key_states.device
         self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
         self.values = torch.tensor([], dtype=self.dtype, device=self.device)
@@ -114,7 +114,7 @@ class DynamicLayer(CacheLayerMixin):
         """
         # Lazy initialization
         if not self.is_initialized:
-            self.lazy_initialization(key_states)
+            self.lazy_initialization(key_states, value_states)
 
         self.keys = torch.cat([self.keys, key_states], dim=-2)
         self.values = torch.cat([self.values, value_states], dim=-2)
@@ -178,8 +178,8 @@ class DynamicSlidingWindowLayer(DynamicLayer):
         self.cumulative_length = 0
         self._sliding_window_tensor = torch.tensor(self.sliding_window, dtype=torch.long)
 
-    def lazy_initialization(self, key_states: torch.Tensor) -> None:
-        super().lazy_initialization(key_states)
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        super().lazy_initialization(key_states, value_states)
         self._sliding_window_tensor = self._sliding_window_tensor.to(self.device)
 
     def update(
@@ -201,7 +201,7 @@ class DynamicSlidingWindowLayer(DynamicLayer):
         """
         # Lazy initialization
         if not self.is_initialized:
-            self.lazy_initialization(key_states)
+            self.lazy_initialization(key_states, value_states)
 
         self.cumulative_length += key_states.shape[-2]
 
@@ -267,7 +267,7 @@ class StaticLayer(CacheLayerMixin):
         super().__init__()
         self.max_cache_len = max_cache_len
 
-    def lazy_initialization(self, key_states: torch.Tensor):
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
         """
         Lazy initialization of the keys and values tensors. This allows to get all properties (dtype, device,
         num_heads in case of TP etc...) at runtime directly, which is extremely practical as it avoids moving
@@ -281,16 +281,18 @@ class StaticLayer(CacheLayerMixin):
         i.e. `mode="reduce-overhead"` is known to fail). But it will in general work correctly, and prefill should
         not be compiled anyway for performances!
         """
-        self.max_batch_size, self.num_heads, _, self.head_dim = key_states.shape
         self.dtype, self.device = key_states.dtype, key_states.device
+        self.max_batch_size, self.num_heads = key_states.shape[:2]
+        self.v_head_dim = value_states.shape[-1]
+        self.k_head_dim = key_states.shape[-1]
 
         self.keys = torch.zeros(
-            (self.max_batch_size, self.num_heads, self.max_cache_len, self.head_dim),
+            (self.max_batch_size, self.num_heads, self.max_cache_len, self.k_head_dim),
             dtype=self.dtype,
             device=self.device,
         )
         self.values = torch.zeros(
-            (self.max_batch_size, self.num_heads, self.max_cache_len, self.head_dim),
+            (self.max_batch_size, self.num_heads, self.max_cache_len, self.v_head_dim),
             dtype=self.dtype,
             device=self.device,
         )
@@ -323,7 +325,7 @@ class StaticLayer(CacheLayerMixin):
         """
         # Lazy initialization
         if not self.is_initialized:
-            self.lazy_initialization(key_states)
+            self.lazy_initialization(key_states, value_states)
 
         # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention,
         # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len)
@@ -398,7 +400,7 @@ class StaticSlidingWindowLayer(StaticLayer):
         """
         # Lazy initialization
         if not self.is_initialized:
-            self.lazy_initialization(key_states)
+            self.lazy_initialization(key_states, value_states)
 
         # Some old models give None for `cache_position` or even omit passing `cache_kwargs` when used as cross-attention,
         # in which case we should copy the whole Layer (key_states.shape[-2] == self.max_cache_len)
@@ -533,7 +535,7 @@ class QuantizedLayer(DynamicLayer):
 
         # Lazy initialization
         if not self.is_initialized:
-            self.lazy_initialization(key_states)
+            self.lazy_initialization(key_states, value_states)
         self._quantized_keys = self._quantize(key_states.contiguous(), axis=self.axis_key)
         self._quantized_values = self._quantize(value_states.contiguous(), axis=self.axis_value)
         return key_states, value_states
@@ -795,10 +797,10 @@ class Cache:
         # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use
         # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only
         # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical
-
+        fake_kv_tensor = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
         # Init all layers
         for layer in self.layers:
-            layer.lazy_initialization(
+            layer.lazy_initialization(fake_kv_tensor, fake_kv_tensor)
 
     def get_seq_length(self, layer_idx: int = 0) -> int:
         """Returns the sequence length of the cache for the given layer."""
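Taken together, the `cache_utils.py` hunks change `lazy_initialization` to receive both key and value states, so a cache layer no longer assumes the two share a head dim (relevant for attention variants where they differ). A minimal standalone sketch of the new pattern (not the library class; shapes are hypothetical):

```python
import torch

class SketchStaticLayer:
    """Standalone illustration of the (key_states, value_states) lazy init."""

    def __init__(self, max_cache_len: int):
        self.max_cache_len = max_cache_len
        self.is_initialized = False

    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
        dtype, device = key_states.dtype, key_states.device
        batch, heads = key_states.shape[:2]
        # Key and value head dims are read separately, as in the diff above
        k_dim, v_dim = key_states.shape[-1], value_states.shape[-1]
        self.keys = torch.zeros((batch, heads, self.max_cache_len, k_dim), dtype=dtype, device=device)
        self.values = torch.zeros((batch, heads, self.max_cache_len, v_dim), dtype=dtype, device=device)
        self.is_initialized = True

# Size-0 tensors on dim -2 carry shape/dtype/device without allocating any
# sequence data, mirroring the fake-tensor trick in `Cache.early_initialization`.
layer = SketchStaticLayer(max_cache_len=16)
layer.lazy_initialization(
    torch.zeros((2, 8, 0, 192)),  # keys: head dim 192
    torch.zeros((2, 8, 0, 128)),  # values: head dim 128
)
assert layer.keys.shape[-1] == 192 and layer.values.shape[-1] == 128
```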
@@ -16,6 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
import copy
|
|
18
18
|
import json
|
|
19
|
+
import math
|
|
19
20
|
import os
|
|
20
21
|
import warnings
|
|
21
22
|
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
|
@@ -25,6 +26,7 @@ from packaging import version
|
|
|
25
26
|
|
|
26
27
|
from . import __version__
|
|
27
28
|
from .dynamic_module_utils import custom_object_save
|
|
29
|
+
from .generation.configuration_utils import GenerationConfig
|
|
28
30
|
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
|
|
29
31
|
from .modeling_rope_utils import RotaryEmbeddingConfigMixin
|
|
30
32
|
from .utils import (
|
|
@@ -49,6 +51,9 @@ logger = logging.get_logger(__name__)
|
|
|
49
51
|
# type hinting: specifying the type of config class that inherits from PreTrainedConfig
|
|
50
52
|
SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig")
|
|
51
53
|
|
|
54
|
+
_FLOAT_TAG_KEY = "__float__"
|
|
55
|
+
_FLOAT_TAG_VALUES = {"Infinity": float("inf"), "-Infinity": float("-inf"), "NaN": float("nan")}
|
|
56
|
+
|
|
52
57
|
|
|
53
58
|
class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
54
59
|
# no-format
|
|
@@ -120,9 +125,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
120
125
|
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
|
|
121
126
|
that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
|
|
122
127
|
in `AUTO_MODELS_FOR_CAUSAL_LM`.
|
|
123
|
-
tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
|
|
124
|
-
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
|
|
125
|
-
and decoder model to have the exact same parameter names.
|
|
126
128
|
chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
|
|
127
129
|
The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
|
|
128
130
|
the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
|
|
@@ -212,7 +214,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
212
214
|
is_decoder: bool = False,
|
|
213
215
|
cross_attention_hidden_size: Optional[int] = None,
|
|
214
216
|
add_cross_attention: bool = False,
|
|
215
|
-
tie_encoder_decoder: bool = False,
|
|
216
217
|
# Fine-tuning task arguments
|
|
217
218
|
architectures: Optional[list[str]] = None,
|
|
218
219
|
finetuning_task: Optional[str] = None,
|
|
@@ -276,6 +277,10 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
276
277
|
self._output_attentions = output_attentions # has public property
|
|
277
278
|
|
|
278
279
|
# Less common kwargs, only used by some models
|
|
280
|
+
if "tie_encoder_decoder" in kwargs:
|
|
281
|
+
tie_encoder_decoder = kwargs.pop("tie_encoder_decoder")
|
|
282
|
+
tie_word_embeddings = tie_encoder_decoder or tie_word_embeddings
|
|
283
|
+
|
|
279
284
|
self.tie_word_embeddings = tie_word_embeddings
|
|
280
285
|
self.chunk_size_feed_forward = chunk_size_feed_forward
|
|
281
286
|
|
|
@@ -284,7 +289,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
284
289
|
self.is_decoder = is_decoder # used in encoder-decoder models to differentiate encoder from decoder
|
|
285
290
|
self.cross_attention_hidden_size = cross_attention_hidden_size
|
|
286
291
|
self.add_cross_attention = add_cross_attention
|
|
287
|
-
self.tie_encoder_decoder = tie_encoder_decoder
|
|
288
292
|
|
|
289
293
|
# Fine-tuning task attributes
|
|
290
294
|
self.architectures = architectures
|
|
@@ -310,7 +314,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
310
314
|
self.decoder_start_token_id = decoder_start_token_id
|
|
311
315
|
|
|
312
316
|
# Parameters for sequence generation saved in the config are popped instead of loading them.
|
|
313
|
-
for parameter_name in
|
|
317
|
+
for parameter_name in GenerationConfig._get_default_generation_params().keys():
|
|
314
318
|
kwargs.pop(parameter_name, None)
|
|
315
319
|
|
|
316
320
|
# Name or path to the pretrained checkpoint
|
|
@@ -320,6 +324,9 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
320
324
|
# Attention implementation to use, if relevant (it sets it recursively on sub-configs)
|
|
321
325
|
self._attn_implementation = kwargs.pop("attn_implementation", None)
|
|
322
326
|
|
|
327
|
+
# Experts implementation to use, if relevant (it sets it recursively on sub-configs)
|
|
328
|
+
self._experts_implementation = kwargs.pop("experts_implementation", None)
|
|
329
|
+
|
|
323
330
|
# Drop the transformers version info
|
|
324
331
|
self.transformers_version = kwargs.pop("transformers_version", None)
|
|
325
332
|
|
|
@@ -413,6 +420,28 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
                 )
                 subconfig._attn_implementation = sub_implementation

+    @property
+    def _experts_implementation(self):
+        return self._experts_implementation_internal
+
+    @_experts_implementation.setter
+    def _experts_implementation(self, value: str | dict | None):
+        """We set it recursively on the sub-configs as well"""
+        # Set it for current config
+        current_moe = getattr(self, "_experts_implementation", None)
+        experts_implementation = value if not isinstance(value, dict) else value.get("", current_moe)
+        self._experts_implementation_internal = experts_implementation
+
+        # Set it recursively on the subconfigs
+        for subconfig_key in self.sub_configs:
+            subconfig = getattr(self, subconfig_key, None)
+            if subconfig is not None:
+                current_subconfig_moe = getattr(subconfig, "_experts_implementation", None)
+                sub_implementation = (
+                    value if not isinstance(value, dict) else value.get(subconfig_key, current_subconfig_moe)
+                )
+                subconfig._experts_implementation = sub_implementation
+
     @property
     def torch_dtype(self):
         logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
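Like `_attn_implementation`, the new property accepts either a single value or a per-sub-config dict, with the empty-string key addressing the top-level config. A hedged usage sketch (the implementation names are placeholders, not documented values):

    from transformers import PretrainedConfig

    config = PretrainedConfig()

    # A plain value applies to this config (and is propagated to any sub-configs).
    config._experts_implementation = "eager"  # "eager" is an illustrative value
    assert config._experts_implementation == "eager"

    # A dict targets sub-configs by key; "" addresses the top-level config itself.
    config._experts_implementation = {"": "grouped_mm"}  # key and value are placeholders
    assert config._experts_implementation == "grouped_mm"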
@@ -449,13 +478,11 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

-        non_default_generation_parameters = self._get_non_default_generation_parameters()
-        if len(non_default_generation_parameters) > 0:
+        generation_parameters = self._get_generation_parameters()
+        if len(generation_parameters) > 0:
             raise ValueError(
-                "Some non-default generation parameters are set in the model config. These should go into either a) "
-                "`model.generation_config` (as opposed to `model.config`); OR b) a GenerationConfig file "
-                "(https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model)."
-                f"\nNon-default generation parameters: {str(non_default_generation_parameters)}",
+                "Some generation parameters are set in the model config. These should go into `model.generation_config`"
+                f"as opposed to `model.config`. \nGeneration parameters found: {str(generation_parameters)}",
             )

         os.makedirs(save_directory, exist_ok=True)
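The check is now stricter: `_get_generation_parameters` flags any generation parameter that is set on the config, is not `None`, and is not one of the class's own constructor defaults, rather than only values differing from the old global defaults. A hedged sketch of the failure mode (the parameter and path are illustrative):

    from transformers import PretrainedConfig

    config = PretrainedConfig()
    config.temperature = 0.7  # a generation parameter stashed on the model config

    try:
        config.save_pretrained("./example-config")  # path is illustrative
    except ValueError as err:
        print(err)  # "Some generation parameters are set in the model config..."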
@@ -754,8 +781,9 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
             # If both are present, use `dtype`
             kwargs["dtype"] = kwargs.get("dtype", torch_dtype)

-        # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
+        # We remove them from kwargs so that they do not appear in `return_unused_kwargs`.
         config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
+        config_dict["experts_implementation"] = kwargs.pop("experts_implementation", None)

         config = cls(**config_dict)

@@ -813,7 +841,56 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
     def _dict_from_json_file(cls, json_file: str | os.PathLike):
         with open(json_file, encoding="utf-8") as reader:
             text = reader.read()
-        return json.loads(text)
+        config_dict = json.loads(text)
+
+        return cls._decode_special_floats(config_dict)
+
+    @classmethod
+    def _encode_special_floats(cls, obj: Any) -> Any:
+        """
+        Iterates over the passed object and encodes specific floats that cannot be JSON-serialized. Python's JSON
+        engine saves floats like `Infinity` (+/-) or `NaN`, which are not compatible with other JSON engines.
+
+        It serializes floats like `Infinity` as an object: `{'__float__': Infinity}`.
+        """
+        if isinstance(obj, float):
+            if math.isnan(obj):
+                return {_FLOAT_TAG_KEY: "NaN"}
+            if obj == float("inf"):
+                return {_FLOAT_TAG_KEY: "Infinity"}
+            if obj == float("-inf"):
+                return {_FLOAT_TAG_KEY: "-Infinity"}
+            return obj
+
+        if isinstance(obj, dict):
+            return {k: cls._encode_special_floats(v) for k, v in obj.items()}
+
+        if isinstance(obj, (list, tuple)):
+            return [cls._encode_special_floats(v) for v in obj]
+
+        return obj
+
+    @classmethod
+    def _decode_special_floats(cls, obj: Any) -> Any:
+        """
+        Iterates over the passed object and decodes specific floats that cannot be JSON-serialized. Python's JSON
+        engine saves floats like `Infinity` (+/-) or `NaN`, which are not compatible with other JSON engines.
+
+        This method deserializes objects like `{'__float__': Infinity}` to their float values like `Infinity`.
+        """
+        if isinstance(obj, dict):
+            if set(obj.keys()) == {_FLOAT_TAG_KEY} and isinstance(obj[_FLOAT_TAG_KEY], str):
+                tag = obj[_FLOAT_TAG_KEY]
+                if tag in _FLOAT_TAG_VALUES:
+                    return _FLOAT_TAG_VALUES[tag]
+                return obj
+
+            return {k: cls._decode_special_floats(v) for k, v in obj.items()}
+
+        if isinstance(obj, list):
+            return [cls._decode_special_floats(v) for v in obj]
+
+        return obj

     def __eq__(self, other):
         return isinstance(other, PreTrainedConfig) and (self.__dict__ == other.__dict__)
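Standards-compliant JSON engines reject bare `Infinity`/`NaN` literals, so the encoder replaces them with a tagged object before dumping. A self-contained sketch of the round-trip, assuming the module-level constants are `_FLOAT_TAG_KEY = "__float__"` (the shape the docstring shows) and the obvious tag table, neither of which appears in this diff:

    import json

    # Assumed values for the module-level constants (not shown in this diff).
    _FLOAT_TAG_KEY = "__float__"
    _FLOAT_TAG_VALUES = {"NaN": float("nan"), "Infinity": float("inf"), "-Infinity": float("-inf")}

    # Encoding: a non-finite float becomes a tagged, JSON-safe object.
    payload = {"rope_theta": {_FLOAT_TAG_KEY: "Infinity"}}
    text = json.dumps(payload)  # '{"rope_theta": {"__float__": "Infinity"}}'

    # Decoding: a single-key tagged dict maps back to the special float.
    loaded = json.loads(text)["rope_theta"]
    if set(loaded.keys()) == {_FLOAT_TAG_KEY} and loaded[_FLOAT_TAG_KEY] in _FLOAT_TAG_VALUES:
        value = _FLOAT_TAG_VALUES[loaded[_FLOAT_TAG_KEY]]
    assert value == float("inf")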
@@ -933,6 +1010,10 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
             config_dict = self.to_diff_dict()
         else:
             config_dict = self.to_dict()
+
+        # Handle +/-Infinity and NaNs
+        config_dict = self._encode_special_floats(config_dict)
+
         return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"

     def to_json_file(self, json_file_path: str | os.PathLike, use_diff: bool = True):
@@ -1019,10 +1100,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         Checks and removes if there are any keys in the dict that should not be serialized when saving the config.
         Runs recursive check on the dict, to remove from all sub configs.
         """
-        if hasattr(self, "quantization_config"):
-            # Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
-            _ = d.pop("_pre_quantization_dtype", None)
-
         if "_auto_class" in d:
             del d["_auto_class"]
         if "_output_attentions" in d:
@@ -1031,6 +1108,8 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
             del d["_commit_hash"]
         if "_attn_implementation_internal" in d:
             del d["_attn_implementation_internal"]
+        if "_experts_implementation_internal" in d:
+            del d["_experts_implementation_internal"]
         # Do not serialize `base_model_tp_plan` for now
         if "base_model_tp_plan" in d:
             del d["base_model_tp_plan"]
@@ -1063,58 +1142,17 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):

         cls._auto_class = auto_class

-    @staticmethod
-    def _get_global_generation_defaults() -> dict[str, Any]:
-        return {
-            "max_length": 20,
-            "min_length": 0,
-            "do_sample": False,
-            "early_stopping": False,
-            "num_beams": 1,
-            "temperature": 1.0,
-            "top_k": 50,
-            "top_p": 1.0,
-            "typical_p": 1.0,
-            "repetition_penalty": 1.0,
-            "length_penalty": 1.0,
-            "no_repeat_ngram_size": 0,
-            "encoder_no_repeat_ngram_size": 0,
-            "bad_words_ids": None,
-            "num_return_sequences": 1,
-            "output_scores": False,
-            "return_dict_in_generate": False,
-            "forced_bos_token_id": None,
-            "forced_eos_token_id": None,
-            "remove_invalid_values": False,
-            "exponential_decay_length_penalty": None,
-            "suppress_tokens": None,
-            "begin_suppress_tokens": None,
-            # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0
-            "num_beam_groups": 1,
-            "diversity_penalty": 0.0,
-        }
-
-    def _get_non_default_generation_parameters(self) -> dict[str, Any]:
+    def _get_generation_parameters(self) -> dict[str, Any]:
         """
         Gets the non-default generation parameters on the PreTrainedConfig instance
         """
-
-
-
-
-
-
-        for parameter_name, default_global_value in self._get_global_generation_defaults().items():
-            if hasattr(self_decoder_config, parameter_name):
-                parameter_value = getattr(self_decoder_config, parameter_name, None)
-                # Two cases in which is okay for the model config to hold generation config parameters:
-                # 1. The parameter is set to `None`, effectively delegating its value to the generation config
-                # 2. The parameter is set the global generation defaults
-                if parameter_value is None or parameter_value == default_global_value:
-                    continue
-                non_default_generation_parameters[parameter_name] = getattr(self_decoder_config, parameter_name)
+        generation_params = {}
+        default_config = self.__class__().to_dict() if not self.has_no_defaults_at_init else {}
+        for key in GenerationConfig._get_default_generation_params().keys():
+            if hasattr(self, key) and getattr(self, key) is not None and key not in default_config:
+                generation_params[key] = getattr(self, key)

-        return non_default_generation_parameters
+        return generation_params

     def get_text_config(self, decoder=None, encoder=None) -> "PreTrainedConfig":
         """
@@ -1255,18 +1293,24 @@ if PreTrainedConfig.push_to_hub.__doc__ is not None:
 PretrainedConfig = PreTrainedConfig


-ALLOWED_LAYER_TYPES = (
+ALLOWED_ATTENTION_LAYER_TYPES = (
     "full_attention",
     "sliding_attention",
     "chunked_attention",
     "linear_attention",  # used in minimax
 )

+ALLOWED_MLP_LAYER_TYPES = (
+    "sparse",
+    "dense",
+)
+

-def layer_type_validation(layer_types: list[str], num_hidden_layers: Optional[int] = None):
+def layer_type_validation(layer_types: list[str], num_hidden_layers: Optional[int] = None, attention: bool = True):
     """Check that `layer_types` is correctly defined."""
-    if not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in layer_types):
-        raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}")
+    allowed_layer_types = ALLOWED_ATTENTION_LAYER_TYPES if attention else ALLOWED_MLP_LAYER_TYPES
+    if not all(layer_type in allowed_layer_types for layer_type in layer_types):
+        raise ValueError(f"The `layer_types` entries must be in {allowed_layer_types}")
     if num_hidden_layers is not None and num_hidden_layers != len(layer_types):
         raise ValueError(
             f"`num_hidden_layers` ({num_hidden_layers}) must be equal to the number of layer types "
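With the new `attention` switch, the same validator now covers per-layer MLP types ("sparse"/"dense") as well as attention layer types. A hedged usage sketch, importing from the module this diff touches:

    from transformers.configuration_utils import layer_type_validation

    # Attention layer types (default behavior, unchanged).
    layer_type_validation(["full_attention", "sliding_attention"], num_hidden_layers=2)

    # MLP layer types are validated against ALLOWED_MLP_LAYER_TYPES.
    layer_type_validation(["sparse", "dense", "sparse"], num_hidden_layers=3, attention=False)

    # Mixing vocabularies raises: "full_attention" is not an allowed MLP layer type.
    try:
        layer_type_validation(["sparse", "full_attention"], attention=False)
    except ValueError as err:
        print(err)  # The `layer_types` entries must be in ('sparse', 'dense')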
transformers/conversion_mapping.py

@@ -18,7 +18,15 @@ from __future__ import annotations
 from copy import deepcopy
 from typing import TYPE_CHECKING

-from .core_model_loading import
+from .core_model_loading import (
+    Chunk,
+    Concatenate,
+    ErnieFuseAndSplitTextVisionExperts,
+    MergeModulelist,
+    Transpose,
+    WeightConverter,
+    WeightRenaming,
+)
 from .utils import is_torch_available

@@ -105,6 +113,57 @@ def _build_checkpoint_conversion_mapping():
                 operations=[MergeModulelist(dim=0)],
             ),
         ],
+        "ernie4_5_vl_moe": [
+            # vision
+            WeightRenaming("vision_model", "vision_tower"),
+            # resampler
+            WeightRenaming("spatial_linear.0", "spatial_linear.fc1"),
+            WeightRenaming("spatial_linear.2", "spatial_linear.fc2"),
+            WeightRenaming("spatial_linear.3", "spatial_linear.ln"),
+            WeightRenaming("temporal_linear.0", "temporal_linear.fc1"),
+            WeightRenaming("temporal_linear.2", "temporal_linear.fc2"),
+            WeightRenaming("temporal_linear.3", "temporal_linear.ln"),
+            # language model
+            WeightRenaming(r"(?<!language_model\.)embed_tokens", "language_model.embed_tokens"),
+            WeightRenaming(r"(?<!language_model\.)layers", "language_model.layers"),
+            WeightConverter(
+                source_patterns="mlp.gate.weight_1",
+                target_patterns="mlp.vision_moe.gate.weight",
+                operations=[Transpose(dim0=0, dim1=1)],
+            ),
+            WeightConverter(
+                source_patterns="mlp.gate.weight",
+                target_patterns="mlp.text_moe.gate.weight",
+                operations=[Transpose(dim0=0, dim1=1)],
+            ),
+            WeightConverter(
+                source_patterns=["mlp.moe_statics.e_score_correction_bias"],
+                target_patterns=[
+                    "mlp.text_moe.gate.moe_statics.e_score_correction_bias",
+                    "mlp.vision_moe.gate.moe_statics.e_score_correction_bias",
+                ],
+                operations=[Chunk(dim=0)],
+            ),
+            WeightConverter(
+                source_patterns=["experts.*.down_proj.weight"],
+                target_patterns=[
+                    "text_moe.experts.down_proj",
+                    "vision_moe.experts.down_proj",
+                ],
+                operations=[ErnieFuseAndSplitTextVisionExperts(stack_dim=0, concat_dim=1)],
+            ),
+            WeightConverter(
+                source_patterns=[
+                    "experts.*.gate_proj.weight",
+                    "experts.*.up_proj.weight",
+                ],
+                target_patterns=[
+                    "text_moe.experts.gate_up_proj",
+                    "vision_moe.experts.gate_up_proj",
+                ],
+                operations=[ErnieFuseAndSplitTextVisionExperts(stack_dim=0, concat_dim=1)],
+            ),
+        ],
         "jamba": [
             WeightConverter(
                 source_patterns=[
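The `(?<!language_model\.)` renamings above use a negative lookbehind so checkpoint keys are prefixed only when they are not already under `language_model.`. A quick standalone check of that regex behavior (plain `re`, independent of the conversion machinery):

    import re

    pattern = re.compile(r"(?<!language_model\.)embed_tokens")

    # Bare key: matched, so the renaming applies the `language_model.` prefix.
    assert pattern.search("model.embed_tokens.weight")

    # Already-prefixed key: the lookbehind rejects it, avoiding a double prefix.
    assert not pattern.search("language_model.embed_tokens.weight")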
@@ -142,12 +201,12 @@ def _build_checkpoint_conversion_mapping():
     if hasattr(torch.nn.utils.parametrizations, "weight_norm"):
         mapping["legacy"] += [
             WeightRenaming(
-                source_patterns="weight_g",
-                target_patterns="parametrizations.weight.original0",
+                source_patterns=".weight_g$",
+                target_patterns=".parametrizations.weight.original0",
             ),
             WeightRenaming(
-                source_patterns="weight_v",
-                target_patterns="parametrizations.weight.original1",
+                source_patterns=".weight_v$",
+                target_patterns=".parametrizations.weight.original1",
             ),
         ]
     else:
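Assuming `WeightRenaming` source patterns are treated as regular expressions (which the `$` anchor implies), the rc2 patterns only match the exact trailing parameter suffix instead of any substring:

    import re

    old = re.compile("weight_g")     # rc1 pattern: bare substring
    new = re.compile(".weight_g$")   # rc2 pattern: anchored to the end of the name

    assert old.search("conv.weight_g") and new.search("conv.weight_g")

    # A name that merely contains the substring no longer matches.
    assert old.search("conv.weight_gate") and not new.search("conv.weight_gate")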
@@ -166,6 +225,9 @@ def _build_checkpoint_conversion_mapping():
     mapping["deepseek_v3"] = mapping["qwen2_moe"].copy()
     mapping["dots1"] = mapping["qwen2_moe"].copy()
     mapping["ernie4_5_moe"] = mapping["qwen2_moe"].copy()
+    mapping["ernie4_5_moe"] += [
+        WeightRenaming("mlp.moe_statics.e_score_correction_bias", "mlp.gate.moe_statics.e_score_correction_bias")
+    ]
     mapping["glm4_moe"] = mapping["qwen2_moe"].copy()
     mapping["glm4v_moe"] = mapping["qwen2_moe"].copy()
     mapping["longcat_flash"] = mapping["qwen2_moe"].copy()
@@ -226,6 +288,7 @@ VLMS = [
     "sam3_tracker",
     "sam3_tracker_video",
     "paddleocrvl",
+    "ernie4_5_vl_moe",
 ]
