transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
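To reproduce this comparison locally, one can download both wheels, unpack them, and diff matching files. A minimal sketch using only the standard library plus `pip` on PATH (the destination paths and the sample file are illustrative):

```python
import difflib
import pathlib
import subprocess
import zipfile

# Fetch both wheels without their dependencies and unpack them side by side.
for version in ("5.0.0rc0", "5.0.0rc2"):
    subprocess.run(
        ["pip", "download", f"transformers=={version}", "--no-deps", "-d", version],
        check=True,
    )
    wheel = next(pathlib.Path(version).glob("*.whl"))
    with zipfile.ZipFile(wheel) as zf:
        zf.extractall(f"{version}-src")

# Show a unified diff for one file from the listing below.
rel = "transformers/models/pvt/modeling_pvt.py"
old = pathlib.Path(f"5.0.0rc0-src/{rel}").read_text().splitlines()
new = pathlib.Path(f"5.0.0rc2-src/{rel}").read_text().splitlines()
print("\n".join(difflib.unified_diff(old, new, f"rc0/{rel}", f"rc2/{rel}", lineterm="")))
```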
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py:

```diff
@@ -393,6 +393,7 @@ class PromptDepthAnythingForDepthEstimation(PromptDepthAnythingPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
```

transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py:

```diff
@@ -236,6 +236,7 @@ class PromptDepthAnythingForDepthEstimation(DepthAnythingForDepthEstimation):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
```
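A recurring change in this release is visible in these two hunks (and again in the ProphetNet, Pvt, and PvtV2 hunks below): forward-style signatures gain a trailing `**kwargs` so shared call sites can pass newer options without breaking older models. A minimal sketch of the effect, using hypothetical modules:

```python
import torch
from torch import nn

class ForwardWithoutKwargs(nn.Module):
    def forward(self, pixel_values, return_dict=None):
        return pixel_values

class ForwardWithKwargs(nn.Module):
    def forward(self, pixel_values, return_dict=None, **kwargs):
        # Unknown options are silently absorbed instead of raising
        # TypeError at the call site.
        return pixel_values

x = torch.zeros(1)
try:
    ForwardWithoutKwargs()(x, output_attentions=False)
except TypeError as e:
    print("strict signature:", e)
print(ForwardWithKwargs()(x, output_attentions=False))  # extra kwarg tolerated
```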
transformers/models/prophetnet/modeling_prophetnet.py:

```diff
@@ -993,6 +993,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Example:
@@ -1113,6 +1114,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetDecoderModelOutput]:
         r"""
         Example:
@@ -1416,6 +1418,7 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetSeq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1845,6 +1848,7 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
         past_key_values=None,
         attention_mask=None,
         use_cache=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- our tests complain if we use GenerationMixin.prepare_inputs_for_generation
@@ -1853,7 +1857,7 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
         if attention_mask is None:
             attention_mask = input_ids.new_ones(input_ids.shape)

-        if past_key_values is not None and
+        if past_key_values is not None and not is_first_iteration:
             input_ids = input_ids[:, -1:]
         # first step, decoder_cached_states are empty
         model_inputs = {
```
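The `is_first_iteration` flag threaded through `prepare_inputs_for_generation` makes the cache-trimming rule explicit: when a cache is already populated and this is not the first decoding step, only the newest token needs to be fed; the first step must still see the full prompt (for example when generation starts from a prefilled cache). A rough standalone sketch of that rule, not ProphetNet's actual code:

```python
import torch

def prepare_inputs(input_ids, past_key_values=None, is_first_iteration=False):
    # On follow-up steps the cache already covers earlier tokens, so only
    # the newest token is fed; on the first step the full prompt is kept.
    if past_key_values is not None and not is_first_iteration:
        input_ids = input_ids[:, -1:]
    return {"input_ids": input_ids, "past_key_values": past_key_values}

prompt = torch.tensor([[5, 7, 9]])
cache = object()  # stand-in for a real Cache instance
print(prepare_inputs(prompt, cache, is_first_iteration=True)["input_ids"].shape)   # (1, 3): full prompt
print(prepare_inputs(prompt, cache, is_first_iteration=False)["input_ids"].shape)  # (1, 1): last token only
```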
transformers/models/pvt/modeling_pvt.py:

```diff
@@ -458,6 +458,7 @@ class PvtModel(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -512,6 +513,7 @@ class PvtForImageClassification(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
```

transformers/models/pvt_v2/modeling_pvt_v2.py:

```diff
@@ -406,6 +406,7 @@ class PvtV2Model(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -460,6 +461,7 @@ class PvtV2ForImageClassification(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -523,6 +525,7 @@ class PvtV2Backbone(PvtV2Model, BackboneMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
```
transformers/models/qwen2/modeling_qwen2.py:

```diff
@@ -13,7 +13,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -27,7 +27,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_qwen2 import Qwen2Config


```
@@ -64,7 +64,7 @@ class Qwen2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
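The hunk above replaces a plain attribute with a non-persistent buffer. A torch-only sketch of the behavioral difference: non-persistent buffers follow the module across device/dtype moves but are excluded from the state_dict:

    import torch
    from torch import nn

    class Rope(nn.Module):
        def __init__(self, inv_freq: torch.Tensor):
            super().__init__()
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # clone() so later dynamic-RoPE updates to inv_freq leave the original intact
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    rope = Rope(torch.ones(4))
    print(list(rope.state_dict().keys()))  # [] -- neither buffer is serialized
    rope.half()                            # both buffers follow dtype/device moves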
@@ -103,7 +103,7 @@ class Qwen2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
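`maybe_autocast` replaces a bare `torch.autocast` here; its implementation is not shown in this diff. A hedged sketch of the likely intent (fall back to a no-op context where autocast is unavailable, otherwise disable it so the RoPE matmul stays in float32):

    import contextlib
    import torch

    def maybe_autocast_sketch(device_type: str, enabled: bool = True):
        # assumption: degrade gracefully on device types torch.autocast rejects
        try:
            return torch.autocast(device_type=device_type, enabled=enabled)
        except RuntimeError:
            return contextlib.nullcontext()

    with maybe_autocast_sketch("cpu", enabled=False):  # force float32, as in the hunk
        freqs = torch.ones(2, 3) @ torch.ones(3, 2)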
@@ -185,6 +185,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

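`use_kernelized_func` is a new hub-kernels decorator (see transformers/integrations/hub_kernels.py in this diff); its body is not shown here. An illustrative sketch of the decorator pattern it replaces — the per-instance `self.rotary_fn = apply_rotary_pos_emb` removed in the next hunk becomes a class-level, swappable function:

    def use_kernelized_func_sketch(default_fn):
        # hypothetical stand-in: the real decorator can substitute a hub kernel
        def wrap(cls):
            cls.rotary_fn = staticmethod(default_fn)
            return cls
        return wrap

    def apply_rotary_pos_emb_stub(q, k, cos, sin):
        return q, k

    @use_kernelized_func_sketch(apply_rotary_pos_emb_stub)
    class Attention:
        pass

    print(Attention.rotary_fn is apply_rotary_pos_emb_stub)  # True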
@@ -202,7 +203,6 @@ class Qwen2Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None

     def forward(
@@ -14,10 +14,11 @@
 # limitations under the License.
 """Tokenization classes for Qwen2."""

+from typing import Optional, Union
+
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE

-from ...tokenization_utils_base import generate_merges
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging

@@ -38,33 +39,30 @@ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p
 class Qwen2Tokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         vocab_file=None,
         merges_file=None,
-        unk_token="<|endoftext|>",
+        unk_token: str = "<|endoftext|>",
         bos_token=None,
-        eos_token="<|endoftext|>",
-        pad_token="<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
+        pad_token: str = "<|endoftext|>",
         add_prefix_space=None,
-        vocab=None,
-        merges=None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
-
-
-
-
-            )
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 "<|endoftext|>": 0,
             }
-
-
+        )
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
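The rewritten __init__ above accepts vocab/merges in memory and builds the backend directly. A usage sketch with the raw `tokenizers` API the class wraps (ids and merges are illustrative, not Qwen2's real vocabulary):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    vocab = {"<|endoftext|>": 0, "h": 1, "i": 2, "hi": 3}
    merges = [("h", "i")]
    tok = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token="<|endoftext|>"))
    print(tok.encode("hi").ids)  # [3] -- the h+i merge applies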
@@ -92,12 +90,10 @@ class Qwen2Tokenizer(TokenizersBackend):
                 ),
             ]
         )
-        tokenizer_object = self._tokenizer

         super().__init__(
             vocab_file=vocab_file,
             merges_file=merges_file,
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -365,7 +365,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
-            ignore_keys_at_rope_validation={"
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )

@@ -713,7 +713,9 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         layer_type_validation(self.layer_types, self.num_hidden_layers)

         self.rope_parameters = rope_parameters
-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Qwen2_5OmniDiTConfig(PreTrainedConfig):
@@ -31,6 +31,7 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import Parameter

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -43,6 +44,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, check_torch_load_is_safe, logging
 from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import maybe_autocast
 from ...utils.hub import cached_file
 from ..qwen2.modeling_qwen2 import Qwen2RMSNorm
 from .configuration_qwen2_5_omni import (
@@ -61,6 +63,52 @@ from .configuration_qwen2_5_omni import (
 logger = logging.get_logger(__name__)


+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
+    """Generates a 1D Kaiser-windowed sinc filter.
+
+    Args:
+        cutoff (float): Normalized cutoff frequency (0 to 0.5).
+        half_width (float): Transition bandwidth.
+        kernel_size (int): Number of filter taps.
+
+    Returns:
+        torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
+    """
+    is_even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # Compute Kaiser window parameters
+    delta_f = 4 * half_width
+    attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+
+    if attenuation > 50.0:
+        beta = 0.1102 * (attenuation - 8.7)
+    elif attenuation >= 21.0:
+        beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
+    else:
+        beta = 0.0
+
+    kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
+
+    # Compute time indices
+    if is_even:
+        time_indices = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time_indices = torch.arange(kernel_size) - half_size
+
+    # Compute sinc filter
+    if cutoff == 0:
+        return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
+
+    sinc_filter = torch.sinc(2 * cutoff * time_indices)
+    normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
+
+    # Normalize to ensure sum = 1 (avoid leakage of constant component)
+    normalized_filter /= normalized_filter.sum()
+
+    return normalized_filter.view(1, 1, kernel_size)
+
+
 @auto_docstring
 class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
     config: Qwen2_5OmniConfig
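The function moved above implements the standard Kaiser window design rule. With A the attenuation in dB that the code derives from the transition width, its beta selection is:

    \beta =
    \begin{cases}
    0.1102\,(A - 8.7) & A > 50 \\
    0.5842\,(A - 21)^{0.4} + 0.07886\,(A - 21) & 21 \le A \le 50 \\
    0 & A < 21
    \end{cases}

and the taps are normalized so the filter has unit DC gain (they sum to 1).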
@@ -74,6 +122,23 @@ class Qwen2_5OmniPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = False
     _supports_attention_backend = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, UpSample1d):
+            filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, DownSample1d):
+            filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
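_init_weights now materializes deterministic buffers at init time instead of loading them from checkpoints. A hedged, torch-only sketch of the pattern (`init.copy_` from the new transformers/initialization.py is assumed here to act like an in-place copy into the target tensor):

    import torch
    from torch import nn

    class Up(nn.Module):
        def __init__(self, kernel_size: int = 12):
            super().__init__()
            self.kernel_size = kernel_size
            self.register_buffer("filter", torch.zeros(1, 1, kernel_size))

    def init_weights(module: nn.Module) -> None:
        if isinstance(module, Up):
            # recompute the analytic filter rather than restoring it from a state_dict
            module.filter.copy_(torch.full((1, 1, module.kernel_size), 1.0 / module.kernel_size))

    up = Up()
    up.apply(init_weights)
    print(round(float(up.filter.sum()), 6))  # 1.0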
@@ -685,6 +750,9 @@ class Qwen2_5OmniAudioEncoderLayer(GradientCheckpointingLayer):
 class SinusoidsPositionEmbedding(nn.Module):
     def __init__(self, length, channels, max_timescale=10000):
         super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
         if channels % 2 != 0:
             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
@@ -1017,6 +1085,22 @@ class Qwen2_5OmniVisionBlock(GradientCheckpointingLayer):
         return hidden_states


+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
 class Qwen2_5_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
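Usage sketch for the relocated rotary table (plain torch, values illustrative): forward(seqlen) returns per-position angles of shape (seqlen, dim // 2), later duplicated and passed through cos/sin for attention:

    import torch

    dim, theta, seqlen = 8, 10000.0, 4
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    freqs = torch.outer(torch.arange(seqlen, dtype=inv_freq.dtype), inv_freq)
    print(freqs.shape)  # torch.Size([4, 4])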
@@ -1043,20 +1127,6 @@ class Qwen2_5_VisionPatchEmbed(nn.Module):
         return hidden_states


-class Qwen2_5_VisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Qwen2_5OmniPatchMerger(nn.Module):
     def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
         super().__init__()
@@ -1104,6 +1174,8 @@ class Qwen2_5OmniVisionEncoder(Qwen2_5OmniPreTrainedModel):
         )
         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
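`self.post_init()` is added at the end of several submodule constructors in this file. A minimal sketch of why (hedged — transformers' real post_init also handles weight tying, pruning, etc.): it triggers the weight-init pass so buffers like the filters above are filled even when a submodule is built standalone:

    class Base:
        def post_init(self):
            # stand-in for PreTrainedModel.post_init -> init_weights()
            print(f"initializing {type(self).__name__}")

    class Encoder(Base):
        def __init__(self):
            self.blocks = []
            self.post_init()  # as added in the diff

    Encoder()  # prints: initializing Encoder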
@@ -1251,7 +1323,7 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -1291,7 +1363,7 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -1958,11 +2030,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
         audio_feature_lengths = None

         if attention_mask is not None and position_ids is None:
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
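The rewritten condition derives "prefill vs. decode" from the cache length instead of cache_position. A minimal sketch of the new test (FakeCache is hypothetical; transformers' Cache exposes get_seq_length):

    class FakeCache:
        def __init__(self, n: int):
            self._n = n
        def get_seq_length(self) -> int:
            return self._n

    for past in (None, FakeCache(0), FakeCache(7)):
        past_len = 0 if past is None else past.get_seq_length()
        print(past_len == 0)  # True, True, False -- rope index recomputed only on prefill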
@@ -1977,7 +2046,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta =
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
@@ -2035,6 +2104,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -2053,12 +2123,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         model_inputs["position_ids"] = None

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None
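The new is_first_iteration flag gates multimodal inputs during generation. A small sketch of the effect: after the first step with the cache enabled, image/video/audio features are nulled so decode steps feed only new tokens:

    def prune_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
        if not is_first_iteration and use_cache:
            for key in ("pixel_values", "pixel_values_videos", "input_features"):
                model_inputs[key] = None
        return model_inputs

    inputs = prune_inputs({"pixel_values": "img"}, is_first_iteration=False, use_cache=True)
    print(inputs["pixel_values"])  # None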
@@ -2317,6 +2388,7 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Qwen2_5OmniTalkerCausalLMOutputWithPast]:
         r"""
         thinker_reply_part (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -2366,11 +2438,8 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

         if attention_mask is not None and position_ids is None:
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_text_ids,
                     image_grid_thw,
@@ -2390,8 +2459,12 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCon
                 self.rope_deltas = rope_deltas

             else:
-
-
+                if inputs_embeds is not None:
+                    batch_size, seq_length, _ = inputs_embeds.shape
+                else:
+                    batch_size, seq_length = input_ids.shape
+
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
@@ -2525,7 +2598,7 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -2564,7 +2637,7 @@ class Qwen2_5OmniDiTRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -3188,52 +3261,6 @@ class SnakeBeta(nn.Module):
         return hidden_states


-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
-    """Generates a 1D Kaiser-windowed sinc filter.
-
-    Args:
-        cutoff (float): Normalized cutoff frequency (0 to 0.5).
-        half_width (float): Transition bandwidth.
-        kernel_size (int): Number of filter taps.
-
-    Returns:
-        torch.Tensor: A tensor of shape (1, 1, kernel_size) representing the filter.
-    """
-    is_even = kernel_size % 2 == 0
-    half_size = kernel_size // 2
-
-    # Compute Kaiser window parameters
-    delta_f = 4 * half_width
-    attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-
-    if attenuation > 50.0:
-        beta = 0.1102 * (attenuation - 8.7)
-    elif attenuation >= 21.0:
-        beta = 0.5842 * (attenuation - 21) ** 0.4 + 0.07886 * (attenuation - 21.0)
-    else:
-        beta = 0.0
-
-    kaiser_window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
-
-    # Compute time indices
-    if is_even:
-        time_indices = torch.arange(-half_size, half_size) + 0.5
-    else:
-        time_indices = torch.arange(kernel_size) - half_size
-
-    # Compute sinc filter
-    if cutoff == 0:
-        return torch.zeros((1, 1, kernel_size), dtype=torch.float32)  # Ensures correct shape
-
-    sinc_filter = torch.sinc(2 * cutoff * time_indices)
-    normalized_filter = 2 * cutoff * kaiser_window * sinc_filter
-
-    # Normalize to ensure sum = 1 (avoid leakage of constant component)
-    normalized_filter /= normalized_filter.sum()
-
-    return normalized_filter.view(1, 1, kernel_size)
-
-
 class UpSample1d(nn.Module):
     def __init__(self, ratio=2, kernel_size=None):
         super().__init__()
@@ -3264,6 +3291,9 @@ class DownSample1d(nn.Module):
         super().__init__()
         cutoff = 0.5 / ratio
         half_width = 0.6 / ratio
+        self.cutoff = cutoff
+        self.half_width = half_width
+        self.kernel_size = kernel_size

         if cutoff < 0.0:
             raise ValueError("Minimum cutoff must be larger than zero.")
@@ -3445,6 +3475,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
             config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
         )

+        self.post_init()
+
     def normalize_spectrogram(self, spectrogram, max_value, min_db):
         return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)

@@ -3459,7 +3491,7 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
         decibel_spectrum = self.amplitude_to_db(amplitude_spectrum, -115) - 20
         return self.normalize_spectrogram(decibel_spectrum, 1, -115)

-    def forward(self, mel_spectrogram):
+    def forward(self, mel_spectrogram, **kwargs):
         processed_spectrogram = self.process_mel_spectrogram(mel_spectrogram)
         hidden_representation = self.conv_pre(processed_spectrogram)

@@ -3572,6 +3604,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size)  # final modulation
         self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)

+        self.post_init()
+
     def _create_block_diff(self, hidden_states):
         batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
         block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
@@ -3592,6 +3626,7 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         drop_audio_conditioning=False,
         drop_code=False,
         apply_cfg=True,
+        **kwargs,
     ):
         batch_size = hidden_states.shape[0]
         if time_step.ndim == 0:
@@ -3723,6 +3758,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
             config.bigvgan_config, attn_implementation=attn_impl
         )

+        self.post_init()
+
     def forward(
         self,
         code,