transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/glm4v/modeling_glm4v.py

@@ -28,6 +28,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -104,6 +105,8 @@ class Glm4vVisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -141,7 +144,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -313,8 +315,8 @@ class Glm4vVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-            if self.config._attn_implementation
-                # Flash Attention
+            if "flash" in self.config._attn_implementation:
+                # Flash Attention: Use cu_seqlens for variable length attention
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
                 attn_output, _ = attention_interface(
                     self,
@@ -403,7 +405,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -705,6 +707,12 @@ class Glm4vPreTrainedModel(PreTrainedModel):
         "attentions": Glm4vTextAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -1487,6 +1495,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1503,13 +1512,14 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if
+        if not is_first_iteration and use_cache:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
 
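A recurring pattern in the GLM-4V changes above: the vision rotary-embedding module now keeps `dim` and `theta` as attributes so its non-persistent `inv_freq` buffer can be recomputed inside `_init_weights`. The sketch below illustrates that pattern in isolation; the class and helper names are illustrative, not the transformers implementations.

```python
import torch
import torch.nn as nn


class VisionRotaryEmbedding(nn.Module):
    """Toy stand-in for the vision rotary embedding touched by this diff."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        # Keep dim/theta around so inv_freq can be rebuilt later.
        self.dim = dim
        self.theta = theta
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        # Non-persistent buffers are absent from the checkpoint, so they must be
        # recomputable when weights are (re)initialized.
        self.register_buffer("inv_freq", inv_freq, persistent=False)


def reinit_rotary_buffers(module: nn.Module) -> None:
    """Hypothetical analogue of the `_init_weights` hook added in the diff."""
    if isinstance(module, VisionRotaryEmbedding):
        inv_freq = 1.0 / (
            module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim)
        )
        module.inv_freq.copy_(inv_freq)


rotary = VisionRotaryEmbedding(dim=64)
rotary.apply(reinit_rotary_buffers)
print(rotary.inv_freq.shape)  # torch.Size([32])
```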
transformers/models/glm4v/modular_glm4v.py

@@ -22,6 +22,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
@@ -32,7 +33,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
@@ -409,7 +410,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -725,6 +725,12 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
         "attentions": Glm4vTextAttention,
     }
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -1414,6 +1420,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1430,13 +1437,14 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if
+        if not is_first_iteration and use_cache:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
 
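Another pattern threaded through both GLM-4V files above is the new `is_first_iteration` flag in `prepare_inputs_for_generation`: pixel inputs are only forwarded on the prefill step, and cached decoding steps drop them because the vision features are already encoded in the KV cache. A simplified, hypothetical version of that control flow (not the actual transformers method):

```python
def drop_cached_vision_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool = True) -> dict:
    """Hypothetical helper mirroring the flag added in this diff.

    On the first (prefill) iteration the image/video tensors are kept so the
    vision tower runs once; on later cached decoding steps they are dropped.
    """
    if not is_first_iteration and use_cache:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None
    return model_inputs


prefill = drop_cached_vision_inputs({"pixel_values": "img", "pixel_values_videos": None}, is_first_iteration=True)
decode = drop_cached_vision_inputs({"pixel_values": "img", "pixel_values_videos": None}, is_first_iteration=False)
assert prefill["pixel_values"] == "img"
assert decode["pixel_values"] is None
```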
transformers/models/glm4v_moe/modeling_glm4v_moe.py

@@ -32,7 +32,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -40,7 +40,13 @@ from ...modeling_outputs import ModelOutput, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_grouped_mm_available,
+    is_torchdynamo_compiling,
+)
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig
 
@@ -107,7 +113,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -395,6 +401,7 @@ class Glm4vMoeTextTopkRouter(nn.Module):
         return router_logits
 
 
+@use_experts_implementation
 class Glm4vMoeTextNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -402,7 +409,7 @@ class Glm4vMoeTextNaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
@@ -586,7 +593,9 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
 
     _can_record_outputs = {
@@ -602,9 +611,13 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, Glm4vMoeTextTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4vMoeTextNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
 
 
 @dataclass
@@ -637,6 +650,22 @@ class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
     aux_loss: Optional[torch.FloatTensor] = None
 
 
+class Glm4vMoeVisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
 class Glm4vMoeisionMlp(nn.Module):
     def __init__(self, config, bias: bool = False):
         super().__init__()
@@ -671,20 +700,6 @@ class Glm4vMoeVisionPatchEmbed(nn.Module):
         return hidden_states
 
 
-class Glm4vMoeVisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Glm4vMoeVisionPatchMerger(nn.Module):
     def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
         super().__init__()
@@ -713,7 +728,6 @@ class Glm4vMoeVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
 
     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -840,8 +854,8 @@ class Glm4vMoeVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-            if self.config._attn_implementation
-                # Flash Attention
+            if "flash" in self.config._attn_implementation:
+                # Flash Attention: Use cu_seqlens for variable length attention
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
                 attn_output, _ = attention_interface(
                     self,
@@ -1763,6 +1777,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1779,13 +1794,14 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None
 
-        if
+        if not is_first_iteration and use_cache:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
 
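On the MoE side, the diff above sets `intermediate_dim` from `config.moe_intermediate_size` and keeps expert weights stacked as 3D tensors: `gate_up_proj` of shape `(num_experts, 2 * intermediate, hidden)` and `down_proj` of shape `(num_experts, hidden, intermediate)`. A rough sketch of how one token flows through a single stacked expert under those shape assumptions (SiLU gating is assumed here; the model uses `config.hidden_act`):

```python
import torch
import torch.nn.functional as F

# Illustrative sizes only; the real values come from the model config
# (num_local_experts, hidden_size, moe_intermediate_size).
num_experts, hidden, intermediate = 4, 8, 16
gate_up_proj = torch.randn(num_experts, 2 * intermediate, hidden)
down_proj = torch.randn(num_experts, hidden, intermediate)


def expert_forward(x: torch.Tensor, expert_idx: int) -> torch.Tensor:
    """Apply one stacked expert: fused gate/up projection, gating, down projection."""
    gate_up = gate_up_proj[expert_idx] @ x               # (2 * intermediate,)
    gate, up = gate_up.chunk(2, dim=-1)                  # each (intermediate,)
    return down_proj[expert_idx] @ (F.silu(gate) * up)   # (hidden,)


token = torch.randn(hidden)
print(expert_forward(token, expert_idx=2).shape)  # torch.Size([8])
```

The related `_can_compile_fullgraph = is_grouped_mm_available()` change gates full-graph compilation on grouped matrix-multiply support, per the experts-interface documentation linked in the diff.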
@@ -18,6 +18,7 @@ from typing import Optional, Union
|
|
|
18
18
|
import torch
|
|
19
19
|
import torch.nn as nn
|
|
20
20
|
|
|
21
|
+
from ... import initialization as init
|
|
21
22
|
from ...cache_utils import Cache, DynamicCache
|
|
22
23
|
from ...configuration_utils import PreTrainedConfig
|
|
23
24
|
from ...masking_utils import create_causal_mask
|
|
@@ -46,6 +47,7 @@ from ..glm4v.modeling_glm4v import (
|
|
|
46
47
|
Glm4vTextModel,
|
|
47
48
|
Glm4vTextRotaryEmbedding,
|
|
48
49
|
Glm4vVisionModel,
|
|
50
|
+
Glm4vVisionRotaryEmbedding,
|
|
49
51
|
rotate_half,
|
|
50
52
|
)
|
|
51
53
|
from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
|
|
@@ -479,11 +481,21 @@ class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vMoeCausalLMOutputWithPast(Qwen3VLMoeCausalLMOutputWithPast):
     pass


+class Glm4vMoeVisionRotaryEmbedding(Glm4vVisionRotaryEmbedding):
+    pass
+
+
 @auto_docstring
 class Glm4vMoeVisionModel(Glm4vVisionModel):
     pass
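The new `_init_weights` override moves the `inv_freq` computation out of the module constructor and writes it into the pre-registered buffer during weight initialization. A simplified sketch of the same pattern in plain PyTorch (using `Tensor.copy_` in place of the library's `init.copy_` helper):

```python
import torch
import torch.nn as nn

class VisionRotaryEmbedding(nn.Module):
    # Simplified stand-in: the buffer is only allocated here and filled later,
    # mirroring the constructor / _init_weights split in the hunk above.
    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        self.dim, self.theta = dim, theta
        self.register_buffer("inv_freq", torch.empty(dim // 2), persistent=False)

def init_rotary(module: nn.Module) -> None:
    if isinstance(module, VisionRotaryEmbedding):
        inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
        with torch.no_grad():
            module.inv_freq.copy_(inv_freq)

rope = VisionRotaryEmbedding(dim=64)
rope.apply(init_rotary)  # nn.Module.apply walks submodules, like model-wide weight init
print(rope.inv_freq[:3])
```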
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glmasr import *
+    from .modeling_glmasr import *
+    from .processing_glmasr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
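This new package `__init__.py` (it appears to be `transformers/models/glmasr/__init__.py`) follows the library's lazy-module pattern: under `TYPE_CHECKING` the star imports expose names to static tooling, while at runtime the module object is swapped for a `_LazyModule` that imports submodules only when an attribute is first accessed. A hedged usage sketch, assuming the rc2 wheel is installed:

```python
# Importing the config class triggers the lazy machinery to load configuration_glmasr
# on first access; heavier submodules such as the modeling code stay unimported until used.
from transformers.models.glmasr import GlmAsrEncoderConfig

encoder_config = GlmAsrEncoderConfig()
print(encoder_config.model_type)  # "glmasr_encoder"
```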
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PreTrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class GlmAsrEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmAsrEncoder`]. It is used to instantiate a
+    glmasr audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the glmasr
+    architecture.
+
+    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5120):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        max_position_embeddings (`int`, *optional*, defaults to 1500):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_mel_bins (`int`, *optional*, defaults to 128):
+            Number of mel features used per input features. Should correspond to the value used in the
+            `GlmAsrProcessor` class.
+
+    ```python
+    >>> from transformers import GlmAsrEncoderConfig, GlmAsrEncoder
+
+    >>> # Initializing a GlmAsrEncoderConfig
+    >>> configuration = GlmAsrEncoderConfig()
+
+    >>> # Initializing a GlmAsrEncoder (with random weights)
+    >>> model = GlmAsrEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glmasr_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1280,
+        intermediate_size=5120,
+        num_hidden_layers=32,
+        num_attention_heads=20,
+        num_key_value_heads=None,
+        hidden_act="gelu",
+        max_position_embeddings=1500,
+        initializer_range=0.02,
+        rope_parameters=None,
+        attention_dropout=0.0,
+        num_mel_bins=128,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.head_dim = hidden_size // num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_parameters = rope_parameters
+        self.attention_dropout = attention_dropout
+        self.num_mel_bins = num_mel_bins
+
+        kwargs.setdefault("partial_rotary_factor", 0.5)
+        super().__init__(**kwargs)
+
+
+class GlmAsrConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmAsrForConditionalGeneration`]. It is used to instantiate an
+    glmasr model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the glmasr-Mini-3B.
+
+    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        audio_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object or dictionary of the audio encoder.
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object or dictionary of the text model.
+        audio_token_id (`int`, *optional*, defaults to 59260):
+            The audio token index to encode the audio prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function (function or string) in the multi-modal projector.
+
+    ```python
+    >>> from transformers import GlmAsrForConditionalGeneration, GlmAsrConfig
+
+    >>> # Initializing a glmasr configuration
+    >>> configuration = GlmAsrConfig()
+
+    >>> # Initializing a GLM-ASR-Nano-2512 model with random weights
+    >>> model = GlmAsrForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "glmasr"
+    sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}
+
+    _default_text_config_kwargs = {
+        "vocab_size": 59264,
+        "hidden_size": 2048,
+        "intermediate_size": 6144,
+        "num_hidden_layers": 28,
+        "num_attention_heads": 16,
+        "num_key_value_heads": 4,
+        "max_position_embeddings": 8192,
+        "rms_norm_eps": 1e-05,
+        "use_cache": True,
+        "eos_token_id": [59246, 59253, 59255],
+        "rope_parameters": {"rope_theta": 10000.0, "rope_type": "default"},
+    }
+
+    def __init__(
+        self,
+        audio_config=None,
+        text_config=None,
+        audio_token_id=59260,
+        projector_hidden_act="gelu",
+        **kwargs,
+    ):
+        if isinstance(audio_config, dict):
+            audio_config["model_type"] = audio_config.get("model_type", "glmasr_encoder")
+            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
+        elif audio_config is None:
+            audio_config = CONFIG_MAPPING["glmasr_encoder"]()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](
+                **{**self._default_text_config_kwargs, **text_config}
+            )
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs)
+        self.text_config = text_config
+
+        self.vocab_size = text_config.vocab_size
+        self.hidden_size = text_config.hidden_size
+        self.audio_token_id = audio_token_id
+        self.projector_hidden_act = projector_hidden_act
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["GlmAsrEncoderConfig", "GlmAsrConfig"]
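`GlmAsrConfig` builds its two sub-configs from dicts or defaults: the audio tower falls back to a fresh `glmasr_encoder` config, and the text backbone defaults to a `llama` config seeded with `_default_text_config_kwargs`, with user-supplied keys overriding those defaults. A short instantiation sketch, assuming the rc2 wheel is installed:

```python
from transformers.models.glmasr import GlmAsrConfig

# All defaults: llama text backbone with the merged default kwargs, glmasr_encoder audio tower.
config = GlmAsrConfig()
print(config.text_config.model_type, config.text_config.vocab_size)     # llama 59264
print(config.audio_config.model_type, config.audio_config.hidden_size)  # glmasr_encoder 1280

# A dict only overrides the keys it lists; everything else keeps the defaults above.
custom = GlmAsrConfig(text_config={"num_hidden_layers": 12})
print(custom.text_config.num_hidden_layers, custom.text_config.hidden_size)  # 12 2048
```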