transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -26,27 +26,13 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import Parameter
 
-from
-from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
-from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
-    Qwen2_5_VisionTransformerPretrainedModel,
-    Qwen2_5_VLAttention,
-    Qwen2_5_VLMLP,
-    Qwen2_5_VLPreTrainedModel,
-    Qwen2_5_VLTextModel,
-    Qwen2_5_VLVisionBlock,
-    eager_attention_forward,
-)
-from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
-from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
-
+from ... import initialization as init
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput, ModelOutput
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import (
     TransformersKwargs,
@@ -56,6 +42,21 @@ from ...utils import (
 )
 from ...utils.deprecation import deprecate_kwarg
 from ...utils.hub import cached_file
+from ..llama.modeling_llama import LlamaRotaryEmbedding, rotate_half
+from ..qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from ..qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionRotaryEmbedding,
+    Qwen2_5_VisionTransformerPretrainedModel,
+    Qwen2_5_VLAttention,
+    Qwen2_5_VLMLP,
+    Qwen2_5_VLPreTrainedModel,
+    Qwen2_5_VLTextModel,
+    Qwen2_5_VLVisionBlock,
+    eager_attention_forward,
+)
+from ..qwen2_audio.configuration_qwen2_audio import Qwen2AudioEncoderConfig
+from ..qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoderLayer
+from ..qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
 
 
 logger = logging.get_logger(__name__)
@@ -399,7 +400,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
-            ignore_keys_at_rope_validation={"
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )
 
@@ -747,7 +748,9 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )
 
 
 class Qwen2_5OmniDiTConfig(PreTrainedConfig):
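Both config hunks route the multimodal `mrope_section` entry past the base class's RoPE validation via `ignore_keys_at_rope_validation`. A minimal, hypothetical sketch of what that escape hatch accomplishes; the helper name, the allowed-key set, and the example values are assumptions, not the library's code:

```python
# Hypothetical helper, not the transformers implementation: it only illustrates
# excluding "mrope_section" from a strict RoPE-parameter check.
def validate_rope_parameters(rope_parameters: dict, allowed: set, ignore_keys: frozenset = frozenset()) -> None:
    unexpected = set(rope_parameters) - allowed - ignore_keys
    if unexpected:
        raise ValueError(f"Unrecognized RoPE parameter keys: {sorted(unexpected)}")


validate_rope_parameters(
    {"rope_type": "default", "rope_theta": 1000000.0, "mrope_section": [16, 24, 24]},  # example values only
    allowed={"rope_type", "rope_theta"},
    ignore_keys=frozenset({"mrope_section"}),  # mirrors ignore_keys_at_rope_validation above
)
```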
@@ -1052,6 +1055,23 @@ class Qwen2_5OmniPreTrainedModel(Qwen2_5_VLPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
     _can_compile_fullgraph = False
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, UpSample1d):
+            filter_tensor = kaiser_sinc_filter1d(0.5 / module.ratio, 0.6 / module.ratio, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, DownSample1d):
+            filter_tensor = kaiser_sinc_filter1d(module.cutoff, module.half_width, module.kernel_size)
+            init.copy_(module.filter, filter_tensor)
+        elif isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5OmniPreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModel):
     input_modalities = ("image", "video", "audio", "text")
@@ -1608,6 +1628,9 @@ class Qwen2_5OmniAudioEncoderLayer(Qwen2AudioEncoderLayer):
 class SinusoidsPositionEmbedding(nn.Module):
     def __init__(self, length, channels, max_timescale=10000):
         super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
         if channels % 2 != 0:
             raise ValueError("SinusoidsPositionEmbedding needs even channels input")
         log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
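The `__init__` now stores `length`, `channels`, and `max_timescale` on the module so the `_init_weights` hook shown earlier can rebuild the sinusoidal table from those attributes. A self-contained sketch of that table computation, assuming nothing beyond NumPy and PyTorch (the function name is illustrative):

```python
import numpy as np
import torch


def sinusoids_table(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    # Same math as the _init_weights branch above: geometric timescales,
    # sin for the first half of the channels, cos for the second half.
    if channels % 2 != 0:
        raise ValueError("channels must be even")
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length).float()[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


table = sinusoids_table(length=8, channels=4)
assert table.shape == (8, 4)
```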
@@ -1916,6 +1939,10 @@ class Qwen2_5OmniVisionBlock(Qwen2_5_VLVisionBlock):
         return hidden_states
 
 
+class Qwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
+    pass
+
+
 class Qwen2_5OmniVisionEncoder(Qwen2_5_VisionTransformerPretrainedModel):
     config: Qwen2_5OmniVisionEncoderConfig
     input_modalities = ("image", "video")
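The re-declared `Qwen2_5_VisionRotaryEmbedding` gives the omni model its own copy of the vision rotary embedding, whose `inv_freq` buffer the new `_init_weights` recomputes. A minimal sketch of that inverse-frequency formula, assuming only a head dimension and a base `theta` (standard RoPE bookkeeping, not the library class):

```python
import torch


def rotary_inv_freq(dim: int, theta: float = 10000.0) -> torch.Tensor:
    # One frequency per channel pair: theta ** (-2i / dim) for i = 0, 1, ..., dim/2 - 1.
    return 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))


inv_freq = rotary_inv_freq(dim=64)
assert inv_freq.shape == (32,)
assert inv_freq[0] == 1.0  # the first channel pair rotates at the base rate
```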
@@ -2306,11 +2333,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
             audio_feature_lengths = None
 
         if attention_mask is not None and position_ids is None:
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
@@ -2325,7 +2349,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta = (cache_position[0] + self.rope_deltas).to(input_ids.device)
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
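The rope-delta bookkeeping above switches from `cache_position[0]` to the length already stored in the KV cache: prefill (empty cache, or no cached deltas yet) recomputes multimodal position ids via `get_rope_index`, while cached decoding simply offsets a fresh `arange` by the cache length plus the per-sample delta. A simplified sketch of the decoding branch, with shapes reduced to 1D position ids for illustration:

    import torch

    def decode_position_ids(seq_length: int, batch_size: int, past_len: int, rope_deltas: torch.Tensor) -> torch.Tensor:
        # rope_deltas: (batch_size, 1), computed once during prefill by get_rope_index.
        delta = past_len + rope_deltas
        position_ids = torch.arange(seq_length).view(1, -1).expand(batch_size, -1)
        return position_ids + delta

    # One new token per sample, 12 tokens already cached:
    decode_position_ids(1, 2, 12, torch.tensor([[0], [-3]]))  # tensor([[12], [9]])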
@@ -2383,6 +2407,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -2401,12 +2426,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
         model_inputs["position_ids"] = None
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None
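`is_first_iteration` is a new flag threaded through `prepare_inputs_for_generation`: multimodal features (pixel values, video pixels, audio features) are only forwarded on the prefill step, since later decoding steps read the encoded content from the KV cache. A small sketch of that gating; the helper name is illustrative, not part of the library:

    def prune_multimodal_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool = True) -> dict:
        # After prefill, image/video/audio content already lives in the KV cache,
        # so the raw features are dropped from subsequent decoding steps.
        if not is_first_iteration and use_cache:
            for key in ("pixel_values", "pixel_values_videos", "input_features"):
                model_inputs[key] = None
        return model_inputs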
@@ -2518,6 +2544,7 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Qwen2_5OmniTalkerCausalLMOutputWithPast]:
         r"""
         thinker_reply_part (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -2567,11 +2594,8 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if attention_mask is not None and position_ids is None:
-            if (
-                cache_position is None
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_text_ids,
                     image_grid_thw,
@@ -2591,8 +2615,12 @@ class Qwen2_5OmniTalkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
                 self.rope_deltas = rope_deltas
 
             else:
-                batch_size, seq_length = input_ids.shape
-                delta = (cache_position[0] + self.rope_deltas).to(input_ids.device)
+                if inputs_embeds is not None:
+                    batch_size, seq_length, _ = inputs_embeds.shape
+                else:
+                    batch_size, seq_length = input_ids.shape
+
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
@@ -3422,6 +3450,9 @@ class DownSample1d(nn.Module):
         super().__init__()
         cutoff = 0.5 / ratio
         half_width = 0.6 / ratio
+        self.cutoff = cutoff
+        self.half_width = half_width
+        self.kernel_size = kernel_size
 
         if cutoff < 0.0:
             raise ValueError("Minimum cutoff must be larger than zero.")
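`DownSample1d` (and, analogously, `UpSample1d`) now keeps its cutoff, transition half-width and kernel size as attributes so `_init_weights` can rebuild the anti-aliasing filter deterministically. For intuition, a generic windowed-sinc low-pass filter looks roughly like the sketch below; this is not the library's `kaiser_sinc_filter1d` (which derives the Kaiser beta from the transition width), just an assumed simplification with a fixed beta:

    import torch

    def kaiser_sinc_lowpass(cutoff: float, kernel_size: int, beta: float = 8.0) -> torch.Tensor:
        # Ideal sinc response at the given cutoff, shaped by a Kaiser window.
        window = torch.kaiser_window(kernel_size, periodic=False, beta=beta)
        time = torch.arange(kernel_size, dtype=torch.float32) - (kernel_size - 1) / 2
        filt = 2 * cutoff * window * torch.special.sinc(2 * cutoff * time)
        return filt / filt.sum()  # normalize to unit DC gain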
@@ -3603,6 +3634,8 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
             config.upsample_initial_channel // (2**self.num_upsample_layers), 1, 7, 1, padding=3, bias=False
         )
 
+        self.post_init()
+
     def normalize_spectrogram(self, spectrogram, max_value, min_db):
         return torch.clamp((2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value, -max_value, max_value)
 
@@ -3617,7 +3650,7 @@ class Qwen2_5OmniToken2WavBigVGANModel(Qwen2_5OmniPreTrainedModel):
         decibel_spectrum = self.amplitude_to_db(amplitude_spectrum, -115) - 20
         return self.normalize_spectrogram(decibel_spectrum, 1, -115)
 
-    def forward(self, mel_spectrogram):
+    def forward(self, mel_spectrogram, **kwargs):
         processed_spectrogram = self.process_mel_spectrogram(mel_spectrogram)
         hidden_representation = self.conv_pre(processed_spectrogram)
 
@@ -3730,6 +3763,8 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         self.norm_out = Qwen2_5_OmniAdaLayerNormZero_Final(config.hidden_size) # final modulation
         self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
 
+        self.post_init()
+
     def _create_block_diff(self, hidden_states):
         batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
         block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size # [seq_length]
@@ -3750,6 +3785,7 @@ class Qwen2_5OmniToken2WavDiTModel(Qwen2_5OmniPreTrainedModel):
         drop_audio_conditioning=False,
         drop_code=False,
         apply_cfg=True,
+        **kwargs,
     ):
         batch_size = hidden_states.shape[0]
         if time_step.ndim == 0:
@@ -3881,6 +3917,8 @@ class Qwen2_5OmniToken2WavModel(Qwen2_5OmniPreTrainedModel):
             config.bigvgan_config, attn_implementation=attn_impl
         )
 
+        self.post_init()
+
     def forward(
         self,
         code,
@@ -230,7 +230,7 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
-            ignore_keys_at_rope_validation={"
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )
 
@@ -32,6 +32,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -43,6 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast
 from ..qwen2.modeling_qwen2 import Qwen2RMSNorm
 from .configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLTextConfig, Qwen2_5_VLVisionConfig
 
@@ -95,6 +97,8 @@ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -216,8 +220,8 @@ class Qwen2_5_VLVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
@@ -303,6 +307,12 @@ class Qwen2_5_VLPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
     config: Qwen2_5_VLVisionConfig
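`inv_freq` is registered as a non-persistent buffer, so it never appears in checkpoints; the `_init_weights` override recreates it from the stored `dim` and `theta` whenever weights are (re)initialized. The formula is the standard RoPE inverse-frequency schedule, sketched standalone:

    import torch

    def rotary_inv_freq(dim: int, theta: float = 10000.0) -> torch.Tensor:
        # theta ** (-2i / dim) for i = 0 .. dim/2 - 1
        return 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))

    rotary_inv_freq(8)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])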
@@ -335,6 +345,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -507,7 +519,7 @@ class Qwen2_5_VLRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
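Keeping the pristine copy of the frequencies as a registered (non-persistent) buffer rather than a plain attribute means it follows `.to(device)` and dtype casts with the module while staying out of the state dict. A minimal sketch of that behaviour, with a toy module:

    import torch
    from torch import nn

    class RotaryBuffers(nn.Module):
        def __init__(self, inv_freq: torch.Tensor):
            super().__init__()
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # Moves with the module on .to(...) but is excluded from checkpoints.
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = RotaryBuffers(torch.tensor([1.0, 0.1, 0.01]))
    assert "original_inv_freq" not in m.state_dict()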
@@ -547,7 +559,7 @@ class Qwen2_5_VLRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False): # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False): # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
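`maybe_autocast` is an internal transformers helper (imported above from `...utils.generic`); judging from the call site it plays the role `torch.autocast` played before, disabling autocast so the rotary frequencies are computed in float32 even under mixed precision. The intended effect, sketched with plain `torch.autocast` and placeholder shapes:

    import torch

    inv_freq_expanded = torch.rand(3, 2, 64, 1)                          # (3, bs, dim/2, 1), placeholder values
    position_ids_expanded = torch.arange(8).float().expand(3, 2, 1, 8)   # (3, bs, 1, positions)
    with torch.autocast(device_type="cpu", enabled=False):  # force float32 math
        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
    emb = torch.cat((freqs, freqs), dim=-1)                              # (3, bs, positions, dim)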
@@ -1290,7 +1302,8 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
 
         if position_ids is None:
-            if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if self.rope_deltas is None or past_key_values_length == 0:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
@@ -1303,10 +1316,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
                 batch_size, seq_length, _ = inputs_embeds.shape
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
-                if cache_position is not None:
-                    delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
-                else:
-                    delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
+                delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
                 position_ids = position_ids + delta.to(position_ids.device)
 
@@ -1526,6 +1536,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1543,6 +1554,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -1552,7 +1564,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -1575,7 +1587,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -26,8 +26,20 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
-from transformers.models.qwen2_vl.modeling_qwen2_vl import (
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PreTrainedConfig
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...video_utils import VideoInput
+from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
+from ..qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     PatchMerger,
     Qwen2RMSNorm,
@@ -40,23 +52,7 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     VisionAttention,
     VisionRotaryEmbedding,
 )
-from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache
-from ...configuration_utils import PreTrainedConfig
-from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput
-from ...modeling_flash_attention_utils import is_flash_attn_available
-from ...modeling_layers import GradientCheckpointingLayer
-from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
-from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import logging
-from ...video_utils import VideoInput
-
-
-if is_flash_attn_available():
-    pass
+from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
 
 
 logger = logging.get_logger(__name__)
@@ -173,7 +169,11 @@ class Qwen2_5_VLVisionBlock(GradientCheckpointingLayer):
 
 
 class Qwen2_5_VLPreTrainedModel(Qwen2VLPreTrainedModel):
-    pass
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Qwen2_5_VisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
 
 
 class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
@@ -207,6 +207,8 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         )
         self.gradient_checkpointing = False
 
+        self.post_init()
+
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
         for t, h, w in grid_thw:
@@ -595,7 +597,8 @@ class Qwen2_5_VLModel(Qwen2VLModel):
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
 
         if position_ids is None:
-            if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if self.rope_deltas is None or past_key_values_length == 0:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
@@ -608,10 +611,7 @@ class Qwen2_5_VLModel(Qwen2VLModel):
                 batch_size, seq_length, _ = inputs_embeds.shape
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
-                if cache_position is not None:
-                    delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
-                else:
-                    delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
+                delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
                 position_ids = position_ids + delta.to(position_ids.device)
 
@@ -778,6 +778,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         image_grid_thw=None,
         video_grid_thw=None,
         second_per_grid_ts=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -795,6 +796,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             video_grid_thw=video_grid_thw,
             second_per_grid_ts=second_per_grid_ts,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -804,7 +806,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         # When compiling, we can't check tensor values thus we check only input length
         # It is safe to assume that `length!=1` means we're in pre-fill because compiled
         # models currently cannot do assisted decoding
-        if cache_position[0] == 0 or self.model.rope_deltas is None:
+        if (cache_position[0] == 0 or not use_cache) or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
@@ -827,7 +829,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
@@ -323,6 +323,7 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -685,6 +686,7 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Qwen2AudioCausalLMOutputWithPast]:
         r"""
         feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
@@ -846,11 +848,11 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMixin):
         # Overwritten -- we should not pass input_features when we are in cached decoding stage
 
         input_features = kwargs.pop("input_features", None)
-        cache_position = kwargs.get("cache_position")
+        is_first_iteration = kwargs.get("is_first_iteration", False)
 
         model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
 
-        if cache_position is not None and cache_position[0] == 0:
+        if is_first_iteration or not kwargs.get("use_cache", True):
             # input_features should only be passed when we are not in cached decoding stage
             model_inputs["input_features"] = input_features
 
@@ -35,7 +35,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import (
     GenericForQuestionAnswering,
@@ -47,8 +52,8 @@ from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_qwen2_moe import Qwen2MoeConfig
 
 
@@ -90,7 +95,7 @@ class Qwen2MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -129,7 +134,7 @@ class Qwen2MoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False): # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False): # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -227,6 +232,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen2MoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -244,7 +250,6 @@ class Qwen2MoeAttention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.qkv_bias)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.qkv_bias)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         if self.config.layer_types[layer_idx] == "sliding_attention":
             self.sliding_window = config.sliding_window
 
@@ -292,6 +297,7 @@ class Qwen2MoeAttention(nn.Module):
         return attn_output, attn_weights
 
 
+@use_experts_implementation
 class Qwen2MoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -432,7 +438,9 @@ class Qwen2MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(Qwen2MoeTopKRouter, index=0),
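Full-graph compilation of the MoE experts path is now gated on whether the installed PyTorch provides grouped matrix multiplication. `is_grouped_mm_available` is a transformers utility; a hedged sketch of such a capability probe (the real check may differ, for example by also inspecting the torch version):

    import torch

    def grouped_mm_available() -> bool:
        # Hypothetical probe: grouped GEMM is exposed as torch._grouped_mm in recent PyTorch builds.
        return hasattr(torch, "_grouped_mm")

    _can_compile_fullgraph = grouped_mm_available()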
@@ -218,7 +218,7 @@ class Qwen2VLTextConfig(PreTrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
-            ignore_keys_at_rope_validation={"
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )
 
@@ -159,8 +159,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
-        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
-            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+        if size is not None:
+            if "shortest_edge" not in size or "longest_edge" not in size:
+                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
         else:
             size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
         # backward compatibility: override size with min_pixels and max_pixels if they are provided