transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py

@@ -31,6 +31,7 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import Parameter
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -62,6 +63,52 @@ from .configuration_qwen2_5_omni import (
 logger = logging.get_logger(__name__)
 
 
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
+    """Generates a 1D Kaiser-windowed sinc filter.
+
+    Args:
+        cutoff (float): Normalized cutoff frequency (0 to 0.5).
+        half_width (float): Transition bandwidth.
+        kernel_size (int): Number of filter taps.
+
+    Returns:
+        torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
+    """
+    is_even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # Compute Kaiser window parameters
+    delta_f = 4 * half_width
+    attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+
+    if attenuation > 50.0:
+        beta = 0.1102 * (attenuation - 8.7)
+    elif attenuation >= 21.0:
+        beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
+    else:
+        beta = 0.0
+
+    kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
+
+    # Compute time indices
+    if is_even:
+        time_indices = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time_indices = torch.arange(kernel_size) - half_size
+
+    # Compute sinc filter
+    if cutoff == 0:
+        return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
+
+    sinc_filter = torch.sinc(2 * cutoff * time_indices)
+    normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
+
+    # Normalize to ensure sum = 1 (avoid leakage of constant component)
+    normalized_filter /= normalized_filter.sum()
+
+    return normalized_filter.view(1, 1, kernel_size)
+
+
 @auto_docstring
 class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
     config: Qwen2_5OmniConfig
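As an aside (not part of the package diff): a minimal sketch of how a (1, 1, kernel_size) low-pass filter like the one kaiser_sinc_filter1d returns is typically applied with a 1D convolution. The filter below is a stand-in built directly from torch.kaiser_window; in the model it would come from the function above.

import torch
import torch.nn.functional as F

kernel_size = 12
# Stand-in for kaiser_sinc_filter1d(0.5 / ratio, 0.6 / ratio, kernel_size):
# a normalized window reshaped to (out_channels=1, in_channels=1, kernel_size)
filt = torch.kaiser_window(kernel_size, periodic=False, beta=8.0).view(1, 1, -1)
filt = filt / filt.sum()  # keep the DC component at unity gain

waveform = torch.randn(1, 1, 16000)  # (batch, channels, time)
filtered = F.conv1d(waveform, filt, padding=kernel_size // 2)
print(filtered.shape)  # torch.Size([1, 1, 16001]) with an even kernel and padding k // 2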
@@ -75,6 +122,23 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = False
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, UpSample1d):
+            filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, DownSample1d):
+            filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
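For reference, the sinusoidal table computed in the SinusoidsPositionEmbedding branch of _init_weights above, restated as a small self-contained sketch (same math; the helper name is illustrative):

import numpy as np
import torch

def sinusoid_table(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    # Fixed sinusoidal embeddings: first half sin, second half cos
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)  # (length, channels)

print(sinusoid_table(4, 8).shape)  # torch.Size([4, 8])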
@@ -686,6 +750,9 @@ class Qwen2_5OmniAudioEncoderLayer(GradientCheckpointingLayer):
 class SinusoidsPositionEmbedding(nn.Module):
     def __init__(self, length, channels, max_timescale=10000):
         super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
         if channels % 2 != 0:
             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
@@ -1018,6 +1085,22 @@ class Qwen2_5OmniVisionBlock(GradientCheckpointingLayer):
         return hidden_states
 
 
+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
 class Qwen2_5_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
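A short worked example of what the relocated Qwen2_5_VisionRotaryEmbedding computes; turning the angles into cos/sin is shown only as the usual pattern, since that part is outside this hunk:

import torch

dim, theta, seqlen = 8, 10000.0, 4
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))  # (dim // 2,)
positions = torch.arange(seqlen, dtype=inv_freq.dtype)
freqs = torch.outer(positions, inv_freq)  # (seqlen, dim // 2): one angle per position/frequency pair
cos, sin = freqs.cos(), freqs.sin()       # typically expanded before rotating q and k
print(freqs.shape)  # torch.Size([4, 4])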
@@ -1044,20 +1127,6 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
         return hidden_states
 
 
-class Qwen2_5_VisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Qwen2_5OmniPatchMerger(nn.Module):
     def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
         super().__init__()
@@ -1105,6 +1174,8 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
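The self.post_init() calls added in this and the later hunks run the standard end-of-__init__ finalization so that _init_weights reaches the freshly built submodules. A rough sketch of that effect, under the assumption that the hook ultimately applies the per-module initializer over the module tree (the real transformers hook does more):

from torch import nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)
        self.post_init()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.zeros_(module.bias)

    def post_init(self):
        # Walk the module tree and apply the per-module initializer (simplified)
        self.apply(self._init_weights)

print(TinyModel().proj.bias.abs().sum().item())  # 0.0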
@@ -1252,7 +1323,7 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
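Registering original_inv_freq as a non-persistent buffer (instead of a plain attribute) lets it follow the module across devices and dtypes while staying out of the checkpoint. An illustrative sketch of that behavior:

import torch
from torch import nn

class RotaryStub(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.ones(4)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = RotaryStub()
print("original_inv_freq" in m.state_dict())     # False: non-persistent buffers are not serialized
print(m.to(torch.float64).original_inv_freq.dtype)  # torch.float64: buffers follow .to()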
@@ -2033,6 +2104,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -2051,12 +2123,13 @@
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         model_inputs["position_ids"] = None
 
-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None
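The new is_first_iteration argument encodes the usual generation-loop pattern: raw image/video/audio inputs are only needed for the prefill step, and later steps rely on the KV cache. A hedged stand-alone sketch of that pattern (the helper name is illustrative, not the model's API):

def prune_multimodal_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
    # After prefill, the multimodal features are already reflected in the cache,
    # so the raw inputs can be dropped from subsequent decoding steps.
    if not is_first_iteration and use_cache:
        for key in ("pixel_values", "pixel_values_videos", "input_features"):
            model_inputs[key] = None
    return model_inputs

inputs = {"pixel_values": object(), "pixel_values_videos": None, "input_features": object()}
print(prune_multimodal_inputs(inputs, is_first_iteration=False, use_cache=True)["pixel_values"])  # None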
@@ -2386,7 +2459,11 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
                 self.rope_deltas = rope_deltas
 
             else:
-
+                if inputs_embeds is not None:
+                    batch_size, seq_length, _ = inputs_embeds.shape
+                else:
+                    batch_size, seq_length = input_ids.shape
+
                 delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
@@ -2521,7 +2598,7 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -3184,52 +3261,6 @@ class SnakeBeta(nn.Module):
         return hidden_states
 
 
-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
-    """Generates a 1D Kaiser-windowed sinc filter.
-
-    Args:
-        cutoff (float): Normalized cutoff frequency (0 to 0.5).
-        half_width (float): Transition bandwidth.
-        kernel_size (int): Number of filter taps.
-
-    Returns:
-        torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
-    """
-    is_even = kernel_size % 2 == 0
-    half_size = kernel_size // 2
-
-    # Compute Kaiser window parameters
-    delta_f = 4 * half_width
-    attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-
-    if attenuation > 50.0:
-        beta = 0.1102 * (attenuation - 8.7)
-    elif attenuation >= 21.0:
-        beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
-    else:
-        beta = 0.0
-
-    kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
-
-    # Compute time indices
-    if is_even:
-        time_indices = torch.arange(-half_size, half_size) + 0.5
-    else:
-        time_indices = torch.arange(kernel_size) - half_size
-
-    # Compute sinc filter
-    if cutoff == 0:
-        return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
-
-    sinc_filter = torch.sinc(2 * cutoff * time_indices)
-    normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
-
-    # Normalize to ensure sum = 1 (avoid leakage of constant component)
-    normalized_filter /= normalized_filter.sum()
-
-    return normalized_filter.view(1, 1, kernel_size)
-
-
 class UpSample1d(nn.Module):
     def __init__(self, ratio=2, kernel_size=None):
         super().__init__()
@@ -3260,6 +3291,9 @@ class DownSample1d(nn.Module):
         super().__init__()
         cutoff = 0.5 / ratio
         half_width = 0.6 / ratio
+        self.cutoff = cutoff
+        self.half_width = half_width
+        self.kernel_size = kernel_size
 
         if cutoff < 0.0:
             raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3441,6 +3475,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
             config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
         )
 
+        self.post_init()
+
     def normalize_spectrogram(self, spectrogram, max_value, min_db):
         return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)
 
@@ -3568,6 +3604,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size)  # final modulation
         self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
 
+        self.post_init()
+
     def _create_block_diff(self, hidden_states):
         batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
         block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
@@ -3720,6 +3758,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
             config.bigvgan_config, attn_implementation=attn_impl
         )
 
+        self.post_init()
+
     def forward(
         self,
         code,
transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py

@@ -26,27 +26,13 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import Parameter
 
-from
-from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
-from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
-    Qwen2_5_VisionTransformerPretrainedModel,
-    Qwen2_5_VLAttention,
-    Qwen2_5_VLMLP,
-    Qwen2_5_VLPreTrainedModel,
-    Qwen2_5_VLTextModel,
-    Qwen2_5_VLVisionBlock,
-    eager_attention_forward,
-)
-from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
-from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
-
+from ... import initialization as init
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput, ModelOutput
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import (
     TransformersKwargs,
@@ -56,6 +42,21 @@ from ...utils import (
 )
 from ...utils.deprecation import deprecate_kwarg
 from ...utils.hub import cached_file
+from ..llama.modeling_llama import LlamaRotaryEmbedding, rotate_half
+from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from ..qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionRotaryEmbedding,
+    Qwen2_5_VisionTransformerPretrainedModel,
+    Qwen2_5_VLAttention,
+    Qwen2_5_VLMLP,
+    Qwen2_5_VLPreTrainedModel,
+    Qwen2_5_VLTextModel,
+    Qwen2_5_VLVisionBlock,
+    eager_attention_forward,
+)
+from ..qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
+from ..qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
+from ..qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
 
 
 logger = logging.get_logger(__name__)
@@ -1054,6 +1055,23 @@ class Qwen2_5OmniPreTrainedModel(Qwen2_5_VLPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
     _can_compile_fullgraph = False
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, UpSample1d):
+            filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, DownSample1d):
+            filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
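The new `_init_weights` override rebuilds deterministic buffers (sinusoidal position tables, Kaiser-sinc filters, rotary inverse frequencies) instead of leaving them to the generic initializer. A minimal standalone sketch of the sin/cos table it computes for `SinusoidsPositionEmbedding` (the helper name `sinusoids_table` is ours):

```python
import numpy as np
import torch


def sinusoids_table(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    # Same construction as the initialization above: geometrically spaced
    # timescales, sin in the first half of the channels, cos in the second.
    if channels % 2 != 0:
        raise ValueError("channels must be even")
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length).float()[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


print(sinusoids_table(length=8, channels=16).shape)  # torch.Size([8, 16])
```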
@@ -1610,6 +1628,9 @@ class Qwen2_5OmniAudioEncoderLayer(Qwen2AudioEncoderLayer):
 class SinusoidsPositionEmbedding(nn.Module):
     def __init__(self, length, channels, max_timescale=10000):
         super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
         if channels % 2 != 0:
             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
@@ -1918,6 +1939,10 @@ class Qwen2_5OmniVisionBlock(Qwen2_5_VLVisionBlock):
         return hidden_states
 
 
+class Qwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
+    pass
+
+
 class Qwen2_5OmniVisionEncoder(Qwen2_5_VisionTransformerPretrainedModel):
     config: Qwen2_5OmniVisionEncoderConfig
     input_modalities = ("image", "video")
@@ -2382,6 +2407,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -2400,12 +2426,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         model_inputs["position_ids"] = None
 
-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None
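The `is_first_iteration` flag threaded through `prepare_inputs_for_generation` exists so that the expensive image/video/audio tensors are only encoded during the prefill step; once their key/values sit in the cache, decode steps need nothing but the new text token. A small standalone sketch of that gating (illustrative helper, not the library code):

```python
def drop_multimodal_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
    # After the prefill step, cached key/values already cover the multimodal
    # prompt, so the heavy tensors are blanked out for every later decode step.
    if not is_first_iteration and use_cache:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None
        model_inputs["input_features"] = None
    return model_inputs


prefill = {"pixel_values": "img", "pixel_values_videos": None, "input_features": "audio"}
decode = dict(prefill)
print(drop_multimodal_inputs(prefill, is_first_iteration=True, use_cache=True))   # tensors kept
print(drop_multimodal_inputs(decode, is_first_iteration=False, use_cache=True))   # tensors dropped
```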
@@ -2588,7 +2615,11 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
             self.rope_deltas = rope_deltas
 
         else:
-
+            if inputs_embeds is not None:
+                batch_size, seq_length, _ = inputs_embeds.shape
+            else:
+                batch_size, seq_length = input_ids.shape
+
             delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
             position_ids = torch.arange(seq_length, device=input_ids.device)
             position_ids = position_ids.view(1, -1).expand(batch_size, -1)
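The talker's `else` branch now derives batch and sequence sizes from whichever input is actually present, which matters when a caller passes `inputs_embeds` without `input_ids`. A quick standalone sketch of the same shape inference:

```python
import torch


def infer_batch_and_seq(input_ids=None, inputs_embeds=None):
    # Prefer the embeddings when token ids were bypassed, otherwise read the ids.
    if inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        batch_size, seq_length = input_ids.shape
    return batch_size, seq_length


print(infer_batch_and_seq(inputs_embeds=torch.zeros(2, 5, 16)))            # (2, 5)
print(infer_batch_and_seq(input_ids=torch.zeros(2, 5, dtype=torch.long)))  # (2, 5)
```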
@@ -3419,6 +3450,9 @@ class DownSample1d(nn.Module):
         super().__init__()
         cutoff = 0.5 / ratio
         half_width = 0.6 / ratio
+        self.cutoff = cutoff
+        self.half_width = half_width
+        self.kernel_size = kernel_size
 
         if cutoff < 0.0:
             raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3600,6 +3634,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
             config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
         )
 
+        self.post_init()
+
     def normalize_spectrogram(self, spectrogram, max_value, min_db):
         return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)
 
@@ -3727,6 +3763,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size)  # final modulation
         self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
 
+        self.post_init()
+
     def _create_block_diff(self, hidden_states):
         batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
         block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
@@ -3879,6 +3917,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
             config.bigvgan_config, attn_implementation=attn_impl
         )
 
+        self.post_init()
+
     def forward(
         self,
         code,
@@ -32,6 +32,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -96,6 +97,8 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -217,8 +220,8 @@ class Qwen2_5_VLVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation
-            # Flash Attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
@@ -304,6 +307,12 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
     config: Qwen2_5_VLVisionConfig
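The `_init_weights` override re-derives the rotary inverse frequencies from the module's stored `dim` and `theta`, which is why those were promoted to attributes above. The schedule itself is the standard RoPE formula; a minimal sketch:

```python
import torch


def rope_inv_freq(dim: int, theta: float = 10000.0) -> torch.Tensor:
    # One frequency per channel pair, decaying geometrically from 1 toward 1/theta.
    return 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))


print(rope_inv_freq(8))  # tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
```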
@@ -336,6 +345,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -508,7 +519,7 @@ class Qwen2_5_VLRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
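Registering `original_inv_freq` as a non-persistent buffer (instead of a plain attribute) means it moves with the module on `.to()`/`.cuda()` calls but is never written to or loaded from checkpoints. A toy module showing the distinction (the class is illustrative, not the library's):

```python
import torch
from torch import nn


class ToyRotary(nn.Module):
    def __init__(self, dim: int = 8, theta: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        # Non-persistent buffers follow device/dtype moves but stay out of state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


m = ToyRotary()
print(list(m.state_dict().keys()))                  # [] -- neither buffer is serialized
print(m.inv_freq.shape, m.original_inv_freq.shape)  # both still live on the module
```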
@@ -1525,6 +1536,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1542,6 +1554,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -1551,7 +1564,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -1574,7 +1587,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -26,8 +26,20 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from
-from
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PreTrainedConfig
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...video_utils import VideoInput
+from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
+from ..qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     PatchMerger,
     Qwen2RMSNorm,
@@ -40,23 +52,7 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     VisionAttention,
     VisionRotaryEmbedding,
 )
-from
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache
-from ...configuration_utils import PreTrainedConfig
-from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput
-from ...modeling_flash_attention_utils import is_flash_attn_available
-from ...modeling_layers import GradientCheckpointingLayer
-from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
-from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import logging
-from ...video_utils import VideoInput
-
-
-if is_flash_attn_available():
-    pass
+from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
 
 
 logger = logging.get_logger(__name__)
@@ -173,7 +169,11 @@ class Qwen2_5_VLVisionBlock(GradientCheckpointingLayer):
 
 
 class Qwen2_5_VLPreTrainedModel(Qwen2VLPreTrainedModel):
-
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
 
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
@@ -207,6 +207,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -776,6 +778,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -793,6 +796,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -802,7 +806,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -825,7 +829,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -848,11 +848,11 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
         # Overwritten -- we should not pass input_features when we are in cached decoding stage
 
         input_features = kwargs.pop("input_features", None)
-
+        is_first_iteration = kwargs.get("is_first_iteration", False)
 
         model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
 
-        if
+        if is_first_iteration or not kwargs.get("use_cache", True):
             # input_features should only be passed when we are not in cached decoding stage
             model_inputs["input_features"] = input_features
 
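The same first-iteration gating reaches Qwen2Audio: `input_features` is popped out of the kwargs and only handed back to the model for the prefill step (or when caching is off). A standalone sketch of that flow, with a plain dict standing in for the real `prepare_inputs_for_generation` output:

```python
def prepare_audio_inputs(**kwargs) -> dict:
    # Toy version of the gating above: keep the heavy audio features only for
    # the first (prefill) step, or whenever the KV cache is disabled.
    input_features = kwargs.pop("input_features", None)
    is_first_iteration = kwargs.get("is_first_iteration", False)
    model_inputs = dict(kwargs)  # stand-in for super().prepare_inputs_for_generation(...)
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["input_features"] = input_features
    return model_inputs


print(prepare_audio_inputs(input_features="feats", is_first_iteration=True, use_cache=True))
print(prepare_audio_inputs(input_features="feats", is_first_iteration=False, use_cache=True))
```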