transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/glm4_moe/modeling_glm4_moe.py

@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -38,8 +38,8 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_glm4_moe import Glm4MoeConfig
 
 
@@ -60,7 +60,7 @@ class Glm4MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
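
The rc2 change above turns `original_inv_freq` from a plain attribute into a non-persistent buffer, so it follows the module across device and dtype moves without ending up in checkpoints. A minimal standalone sketch of the difference (not the transformers code itself):

```python
import torch
import torch.nn as nn


class RotaryFreqs(nn.Module):
    """Toy module contrasting a plain attribute with a non-persistent buffer."""

    def __init__(self, dim: int = 8, theta: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        # Plain attribute: does not follow Module.to() casts or device moves.
        self.plain_inv_freq = inv_freq.clone()
        # Non-persistent buffer: follows .to()/.cuda() but is excluded from state_dict.
        self.register_buffer("buffered_inv_freq", inv_freq.clone(), persistent=False)


m = RotaryFreqs().to(torch.float64)
print(m.plain_inv_freq.dtype)                 # torch.float32 -- attribute stayed behind
print(m.buffered_inv_freq.dtype)              # torch.float64 -- buffer followed the cast
print("buffered_inv_freq" in m.state_dict())  # False, because persistent=False
```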
@@ -101,7 +101,7 @@ class Glm4MoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
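
`maybe_autocast` replaces the direct autocast context here. I have not verified its implementation in `transformers.utils.generic`; the sketch below is only an assumed shape for such a helper, wrapping `torch.autocast` and falling back to a no-op context when autocast is unavailable for the device:

```python
# Illustrative only: an assumed shape for a maybe_autocast-style helper, not the
# actual transformers.utils.generic implementation.
from contextlib import nullcontext

import torch


def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    """Return torch.autocast when the device type supports it, else a no-op context."""
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        # e.g. device types without an autocast backend
        return nullcontext()


inv_freq = torch.rand(1, 4, 1)
position_ids = torch.arange(6, dtype=torch.float)[None, None, :]
with maybe_autocast(device_type="cpu", enabled=False):  # force float32 math
    freqs = (inv_freq.float() @ position_ids.float()).transpose(1, 2)
print(freqs.shape, freqs.dtype)  # torch.Size([1, 6, 4]) torch.float32
```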
@@ -193,6 +193,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Glm4MoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -331,6 +332,7 @@ class Glm4MoeRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
+@use_experts_implementation
 class Glm4MoeNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -338,7 +340,7 @@ class Glm4MoeNaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
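
`Glm4MoeNaiveMoe` above stores every expert's projections stacked into 3D tensors (`gate_up_proj`, `down_proj`). A toy sketch of applying such stacked weights per expert, assuming SiLU gating and a plain loop rather than the routing and grouped-matmul fast path the library actually uses:

```python
# Minimal sketch of applying stacked 3D expert weights; router logic and the
# grouped-matmul kernel path are omitted.
import torch
import torch.nn as nn
import torch.nn.functional as F

num_experts, hidden_dim, intermediate_dim = 4, 16, 32
gate_up_proj = nn.Parameter(torch.randn(num_experts, 2 * intermediate_dim, hidden_dim) * 0.02)
down_proj = nn.Parameter(torch.randn(num_experts, hidden_dim, intermediate_dim) * 0.02)

tokens = torch.randn(5, hidden_dim)          # 5 tokens routed to...
expert_ids = torch.tensor([0, 2, 2, 1, 3])   # ...one expert each (toy routing)

out = torch.zeros_like(tokens)
for e in range(num_experts):
    mask = expert_ids == e
    if not mask.any():
        continue
    x = tokens[mask]                                   # (n_e, hidden)
    gate, up = (x @ gate_up_proj[e].T).chunk(2, dim=-1)
    out[mask] = (F.silu(gate) * up) @ down_proj[e].T   # (n_e, hidden)
print(out.shape)  # torch.Size([5, 16])
```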
@@ -485,18 +487,22 @@ class Glm4MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": Glm4MoeDecoderLayer,
         "attentions": Glm4MoeAttention,
     }
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
 
     @torch.no_grad()
     def _init_weights(self, module):
         super()._init_weights(module)
         if isinstance(module, Glm4MoeTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4MoeNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
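
The new `_keep_in_fp32_modules_strict = ["e_score_correction_bias"]` flag marks parameters that should stay in float32 even when the model is cast to half precision. A rough sketch of the idea only; the real handling lives inside the transformers loading code:

```python
# Rough sketch of the idea behind _keep_in_fp32_modules_strict, not the actual
# transformers loading logic (which applies this while loading checkpoints).
import torch
import torch.nn as nn


class ToyRouter(nn.Module):
    def __init__(self, hidden: int = 8, experts: int = 4):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(experts, hidden))
        self.e_score_correction_bias = nn.Parameter(torch.zeros(experts))


keep_in_fp32 = ["e_score_correction_bias"]

router = ToyRouter().to(torch.bfloat16)  # whole module cast to bf16
for name, param in router.named_parameters():
    if any(key in name for key in keep_in_fp32):
        param.data = param.data.float()  # force the sensitive parameter back to fp32

print(router.weight.dtype)                   # torch.bfloat16
print(router.e_score_correction_bias.dtype)  # torch.float32
```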
transformers/models/glm4v/configuration_glm4v.py

@@ -234,7 +234,9 @@ class Glm4vTextConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.rope_parameters = rope_parameters
 
-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )
 
 
 class Glm4vConfig(PreTrainedConfig):
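
`ignore_keys_at_rope_validation={"mrope_section"}` asks the base config's rope-parameter validation to skip the multimodal `mrope_section` entry. A toy illustration of that validate-except-ignored pattern, with hypothetical allowed keys, not the actual `PreTrainedConfig` validator:

```python
# Toy sketch of validating rope parameters while skipping ignored keys; the
# allowed-key set below is hypothetical, not the PreTrainedConfig validator.
ALLOWED_ROPE_KEYS = {"rope_type", "rope_theta", "factor"}


def validate_rope_parameters(rope_parameters: dict, ignore_keys: set = frozenset()):
    unknown = set(rope_parameters) - ALLOWED_ROPE_KEYS - set(ignore_keys)
    if unknown:
        raise ValueError(f"Unexpected rope parameter keys: {sorted(unknown)}")


params = {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [16, 24, 24]}
validate_rope_parameters(params, ignore_keys={"mrope_section"})  # passes
# validate_rope_parameters(params) would raise, flagging 'mrope_section'
```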
transformers/models/glm4v/image_processing_glm4v.py

@@ -353,7 +353,6 @@ class Glm4vImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                 `True`.
-                The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
                 The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
@@ -380,12 +379,9 @@ class Glm4vImageProcessor(BaseImageProcessor):
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
 
         """
-        # Try to use config values if set, otherwise fallback to global defaults
         size = size if size is not None else self.size
         if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        elif size is None:
-            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}
 
         do_resize = do_resize if do_resize is not None else self.do_resize
         resample = resample if resample is not None else self.resample
@@ -28,6 +28,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -40,7 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig


@@ -104,6 +105,8 @@ class Glm4vVisionRotaryEmbedding(nn.Module):

     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)

@@ -141,7 +144,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -313,8 +315,8 @@ class Glm4vVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-            if self.config._attn_implementation
-                # Flash Attention
+            if "flash" in self.config._attn_implementation:
+                # Flash Attention: Use cu_seqlens for variable length attention
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
                 attn_output, _ = attention_interface(
                     self,
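The rewritten branch checks for a flash implementation by substring and derives max_seqlen from cu_seqlens. A small self-contained sketch of how cumulative sequence lengths describe a packed, variable-length batch, using the same expression as the hunk above; the numbers are illustrative.

import torch

# Three images whose patch sequences are packed into one tensor;
# cu_seqlens holds cumulative boundaries, as the flash-attention branch expects.
seq_lens = torch.tensor([196, 49, 256])
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int32), seq_lens.cumsum(0).to(torch.int32)])
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()

print(cu_seqlens.tolist())  # [0, 196, 245, 501]
print(int(max_seqlen))      # 256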
@@ -403,7 +405,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
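original_inv_freq is now a non-persistent buffer rather than a plain attribute: it moves with the module across devices and dtypes but is excluded from the state dict, which is why it is recomputed at init time instead of being loaded. A small generic PyTorch check of that behavior (not transformers-specific):

import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2, dtype=torch.float) / 8))
        self.register_buffer("inv_freq", inv_freq, persistent=False)            # not saved
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = Toy()
print(list(m.state_dict().keys()))  # [] -- non-persistent buffers are not serialized
print(m.inv_freq.shape)             # torch.Size([4]) -- but they still live on the module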
@@ -446,7 +448,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
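The RoPE frequency computation now runs under maybe_autocast(..., enabled=False) so the matmul and trig stay in float32 even when the forward pass executes under autocast. A rough stand-in for the same intent using plain torch.autocast; the actual maybe_autocast helper comes from transformers.utils.generic (imported in the hunks above) and may handle more cases.

import torch

inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 64, 2, dtype=torch.float) / 64))
position_ids = torch.arange(16).float()

# Disable autocast around the frequency computation so cos/sin stay in float32,
# mirroring the "Force float32" comment in the hunk above.
with torch.autocast(device_type="cpu", enabled=False):
    freqs = torch.outer(position_ids, inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)
    cos, sin = emb.cos(), emb.sin()

print(cos.dtype, cos.shape)  # torch.float32 torch.Size([16, 64])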
@@ -705,6 +707,12 @@ class Glm4vPreTrainedModel(PreTrainedModel):
         "attentions": Glm4vTextAttention,
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
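The new _init_weights override recomputes inv_freq from the dim and theta attributes the vision rotary embedding now keeps, and copies it into the buffer; since the buffer is non-persistent, this is what fills it in when the model is materialized rather than loaded from a checkpoint. A hedged sketch of that re-init pattern on a toy module, with a plain copy_ standing in for the transformers.initialization helper used above:

import torch
from torch import nn

class ToyRotary(nn.Module):
    def __init__(self, dim: int, theta: float = 10000.0):
        super().__init__()
        self.dim, self.theta = dim, theta
        self.register_buffer("inv_freq", torch.empty(dim // 2), persistent=False)

@torch.no_grad()
def reinit_rotary(module: ToyRotary):
    # Recompute the frequency table from dim/theta and write it into the buffer,
    # mirroring what the _init_weights override above does via init.copy_.
    inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
    module.inv_freq.copy_(inv_freq)

rot = ToyRotary(dim=8)
reinit_rotary(rot)
print(rot.inv_freq)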
@@ -768,7 +776,7 @@ class Glm4vVisionModel(Glm4vPreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1487,6 +1495,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
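prepare_inputs_for_generation now takes an explicit is_first_iteration flag, and the next hunk uses it together with use_cache to stop re-encoding images and videos once the prompt has been processed: on later decoding steps only new token ids are fed, so the pixel inputs are nulled out. A rough sketch of that gating with hypothetical helper names (not the actual method):

def gate_pixel_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
    # Illustrative: after the first forward pass of cached generation, the vision
    # towers have already produced their embeddings, so pixel inputs are dropped.
    if not is_first_iteration and use_cache:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None
    return model_inputs

inputs = {"pixel_values": "<tensor>", "pixel_values_videos": None}
print(gate_pixel_inputs(dict(inputs), is_first_iteration=False, use_cache=True))
# {'pixel_values': None, 'pixel_values_videos': None}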
@@ -1503,13 +1512,14 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -22,6 +22,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
@@ -32,11 +33,11 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from ...video_utils import VideoInput
 from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, Glm4RotaryEmbedding, eager_attention_forward
 from ..qwen2_5_vl.modeling_qwen2_5_vl import (
@@ -271,7 +272,9 @@ class Glm4vTextConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.rope_parameters = rope_parameters

-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Glm4vConfig(PreTrainedConfig):
@@ -407,7 +410,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -509,7 +511,7 @@ class Glm4vTextRotaryEmbedding(Glm4RotaryEmbedding):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -723,6 +725,12 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
         "attentions": Glm4vTextAttention,
     }

+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -786,7 +794,7 @@ class Glm4vVisionModel(Glm4vPreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1412,6 +1420,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1428,13 +1437,14 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -280,7 +280,9 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Glm4vMoeConfig(PreTrainedConfig):
@@ -32,7 +32,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -40,8 +40,14 @@ from ...modeling_outputs import ModelOutput, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import
-
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_grouped_mm_available,
+    is_torchdynamo_compiling,
+)
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig


@@ -107,7 +113,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -150,7 +156,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -299,6 +305,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
     return q_embed, k_embed


+@use_kernelized_func(apply_rotary_pos_emb)
 class Glm4vMoeTextAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -322,7 +329,6 @@ class Glm4vMoeTextAttention(nn.Module):
             config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
         )
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.rope_parameters = config.rope_parameters

     def forward(
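Instead of stashing apply_rotary_pos_emb on each attention instance, the class is now wrapped with @use_kernelized_func(apply_rotary_pos_emb), which appears to let a kernelized variant of the rotary function be swapped in when one is available. The rotary math itself follows the usual rotate-half formulation; below is a minimal, self-contained sketch of that math only, ignoring the kernelized path and the multimodal mrope_section splitting this model also applies.

import torch

def rotate_half(x):
    # Split the last dimension in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb_sketch(q, k, cos, sin):
    # Standard RoPE application; the model's multimodal variant additionally
    # interleaves temporal/height/width sections before this step.
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

q = torch.randn(1, 2, 4, 8)   # (batch, heads, seq, head_dim)
k = torch.randn(1, 2, 4, 8)
cos = torch.ones(1, 1, 4, 8)  # trivial angles just to exercise the shapes
sin = torch.zeros(1, 1, 4, 8)
q_rot, k_rot = apply_rotary_pos_emb_sketch(q, k, cos, sin)
print(torch.allclose(q_rot, q), q_rot.shape)  # True torch.Size([1, 2, 4, 8])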
@@ -395,6 +401,7 @@ class Glm4vMoeTextTopkRouter(nn.Module):
         return router_logits


+@use_experts_implementation
 class Glm4vMoeTextNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""

@@ -402,7 +409,7 @@ class Glm4vMoeTextNaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
@@ -586,7 +593,9 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True

     _can_record_outputs = {
@@ -594,6 +603,7 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
         "attentions": Glm4vMoeTextAttention,
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
     input_modalities = ("text", "image", "video")

     @torch.no_grad()
@@ -601,9 +611,13 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, Glm4vMoeTextTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4vMoeTextNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)


 @dataclass
@@ -636,6 +650,22 @@ class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
     aux_loss: Optional[torch.FloatTensor] = None


+class Glm4vMoeVisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
 class Glm4vMoeisionMlp(nn.Module):
     def __init__(self, config, bias: bool = False):
         super().__init__()
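The vision rotary embedding is moved ahead of its first use and now records dim and theta as attributes. Its forward(seqlen) returns a (seqlen, dim // 2) table of angles, which the surrounding code duplicates before taking cos/sin, as the text RoPE hunks above show. A small usage sketch based only on the class shown here; the surrounding model wiring is omitted.

import torch

theta, dim, seqlen = 10000.0, 32, 6
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))  # (dim // 2,)
seq = torch.arange(seqlen, dtype=inv_freq.dtype)
freqs = torch.outer(seq, inv_freq)       # (seqlen, dim // 2), same as forward(seqlen)

emb = torch.cat((freqs, freqs), dim=-1)  # duplicate the halves, as the RoPE code above does
cos, sin = emb.cos(), emb.sin()
print(freqs.shape, cos.shape)            # torch.Size([6, 16]) torch.Size([6, 32])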
@@ -670,20 +700,6 @@ class Glm4vMoeVisionPatchEmbed(nn.Module):
         return hidden_states


-class Glm4vMoeVisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Glm4vMoeVisionPatchMerger(nn.Module):
     def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
         super().__init__()
@@ -712,7 +728,6 @@ class Glm4vMoeVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -839,8 +854,8 @@ class Glm4vMoeVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-            if self.config._attn_implementation
-                # Flash Attention
+            if "flash" in self.config._attn_implementation:
+                # Flash Attention: Use cu_seqlens for variable length attention
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
                 attn_output, _ = attention_interface(
                     self,
@@ -975,7 +990,7 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1762,6 +1777,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1778,13 +1794,14 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -18,6 +18,7 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn

+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
 from ...masking_utils import create_causal_mask
@@ -46,6 +47,7 @@ from ..glm4v.modeling_glm4v import (
     Glm4vTextModel,
     Glm4vTextRotaryEmbedding,
     Glm4vVisionModel,
+    Glm4vVisionRotaryEmbedding,
     rotate_half,
 )
 from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
@@ -227,7 +229,7 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
         PreTrainedConfig.__init__(
-            self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"
+            self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
         )

@@ -479,11 +481,21 @@ class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vMoeCausalLMOutputWithPast(Qwen3VLMoeCausalLMOutputWithPast):
     pass


+class Glm4vMoeVisionRotaryEmbedding(Glm4vVisionRotaryEmbedding):
+    pass
+
+
 @auto_docstring
 class Glm4vMoeVisionModel(Glm4vVisionModel):
     pass
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glmasr import *
+    from .modeling_glmasr import *
+    from .processing_glmasr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
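The new glmasr package init follows the lazy-module pattern used across the library: under TYPE_CHECKING the submodules are imported normally, while at runtime the module object is replaced by a _LazyModule that resolves attributes on first access. A generic, self-contained sketch of the same idea; this is illustrative only, since the real _LazyModule also handles import-structure registration and error reporting.

import importlib
import types

class LazyModule(types.ModuleType):
    """Minimal stand-in: resolve submodule attributes only when first accessed."""

    def __init__(self, name, submodules):
        super().__init__(name)
        self._submodules = submodules

    def __getattr__(self, item):
        # Import candidate submodules lazily and forward the attribute lookup.
        for sub in self._submodules:
            mod = importlib.import_module(f"{self.__name__}.{sub}")
            if hasattr(mod, item):
                return getattr(mod, item)
        raise AttributeError(item)

# A real package would do: sys.modules[__name__] = LazyModule(__name__, [...]),
# so `from package import SomeClass` only imports the submodule that defines it.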