transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -35,7 +35,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -49,8 +54,8 @@ from ...modeling_outputs import (
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs
+from ...utils import auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import OutputRecorder, TransformersKwargs, check_model_inputs, maybe_autocast
 from .configuration_qwen3_omni_moe import (
     Qwen3OmniMoeAudioEncoderConfig,
     Qwen3OmniMoeCode2WavConfig,
@@ -64,6 +69,27 @@ from .configuration_qwen3_omni_moe import (
 )
 
 
+class SinusoidsPositionEmbedding(nn.Module):
+    def __init__(self, length, channels, max_timescale=10000):
+        super().__init__()
+        self.length = length
+        self.channels = channels
+        self.max_timescale = max_timescale
+        if channels % 2 != 0:
+            raise ValueError("SinusoidsPositionEmbedding needs even channels input")
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        self.register_buffer(
+            "positional_embedding",
+            torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
+            persistent=False,
+        )
+
+    def forward(self, seqlen: int):
+        return self.positional_embedding[:seqlen, :]
+
+
 @auto_docstring
 class Qwen3OmniMoePreTrainedModel(PreTrainedModel):
     config: Qwen3OmniMoeConfig
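For context, the relocated `SinusoidsPositionEmbedding` precomputes the standard fixed sin/cos table. A minimal standalone sketch of the same computation (plain PyTorch, illustrative names, not code from the package):

```python
import numpy as np
import torch


def sinusoid_table(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    """Return a [length, channels] table: sin for the first half of channels, cos for the second."""
    if channels % 2 != 0:
        raise ValueError("channels must be even")
    # Geometric progression of inverse timescales, as in the class above.
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length).float()[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


print(sinusoid_table(length=8, channels=16).shape)  # torch.Size([8, 16])
```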
@@ -85,6 +111,19 @@ class Qwen3OmniMoePreTrainedModel(PreTrainedModel):
             init.normal_(module.experts.gate_up_proj, mean=0.0, std=std)
             init.normal_(module.experts.down_proj, mean=0.0, std=std)
             init.normal_(module.gate.weight, mean=0.0, std=std)
+        elif isinstance(module, Qwen3OmniMoeCode2Wav):
+            init.copy_(
+                module.code_offset,
+                torch.arange(module.config.num_quantizers).view(1, -1, 1) * module.config.codebook_size,
+            )
+        elif isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, Qwen3OmniMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
 
 
 def _get_feat_extract_output_lengths(input_lengths):
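The new `_init_weights` branches recompute values that live in non-persistent buffers (`positional_embedding`, `inv_freq`, `code_offset`), which are excluded from the checkpoint and therefore must be rebuilt whenever weights are (re)initialized. A generic illustration of why, using a plain `nn.Module` rather than the actual transformers loading path:

```python
import torch
from torch import nn


class WithNonPersistentBuffer(nn.Module):
    def __init__(self):
        super().__init__()
        # persistent=False keeps the tensor out of state_dict(), so it can never be
        # restored from a checkpoint and has to be recomputed at initialization time.
        self.register_buffer("inv_freq", torch.arange(4).float(), persistent=False)


module = WithNonPersistentBuffer()
print("inv_freq" in module.state_dict())  # False: the buffer is not serialized
```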
@@ -620,24 +659,6 @@ class Qwen3OmniMoeAudioEncoderLayer(GradientCheckpointingLayer):
         return outputs
 
 
-class SinusoidsPositionEmbedding(nn.Module):
-    def __init__(self, length, channels, max_timescale=10000):
-        super().__init__()
-        if channels % 2 != 0:
-            raise ValueError("SinusoidsPositionEmbedding needs even channels input")
-        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
-        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
-        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-        self.register_buffer(
-            "positional_embedding",
-            torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
-            persistent=False,
-        )
-
-    def forward(self, seqlen: int):
-        return self.positional_embedding[:seqlen, :]
-
-
 @auto_docstring(
     custom_intro="""
     Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@@ -716,6 +737,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen3OmniMoePreTrainedModel):
         input_features,
         feature_lens=None,
         aftercnn_lens=None,
+        **kwargs,
     ):
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -890,8 +912,8 @@ class Qwen3OmniMoeVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation
-            # Flash Attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
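The `cu_seqlens` referenced in the updated comment are cumulative sequence boundaries for a packed variable-length batch, and the `max_seqlen` line in the hunk recovers the longest segment from them, e.g.:

```python
import torch

# Three packed sequences of lengths 3, 5 and 2 give boundaries [0, 3, 8, 10].
cu_seqlens = torch.tensor([0, 3, 8, 10])
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
print(max_seqlen)  # tensor(5)
```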
@@ -959,6 +981,22 @@ class Qwen3OmniMoeVisionPatchMerger(nn.Module):
|
|
|
959
981
|
return hidden
|
|
960
982
|
|
|
961
983
|
|
|
984
|
+
class Qwen3OmniMoeVisionRotaryEmbedding(nn.Module):
|
|
985
|
+
inv_freq: torch.Tensor # fix linting for `register_buffer`
|
|
986
|
+
|
|
987
|
+
def __init__(self, dim: int, theta: float = 10000.0) -> None:
|
|
988
|
+
super().__init__()
|
|
989
|
+
self.dim = dim
|
|
990
|
+
self.theta = theta
|
|
991
|
+
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
|
|
992
|
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
|
993
|
+
|
|
994
|
+
def forward(self, seqlen: int) -> torch.Tensor:
|
|
995
|
+
seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
|
|
996
|
+
freqs = torch.outer(seq, self.inv_freq)
|
|
997
|
+
return freqs
|
|
998
|
+
|
|
999
|
+
|
|
962
1000
|
class Qwen3OmniMoeVisionMLP(nn.Module):
|
|
963
1001
|
def __init__(self, config):
|
|
964
1002
|
super().__init__()
|
|
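Note: the relocated `Qwen3OmniMoeVisionRotaryEmbedding` now stores `dim` and `theta` on the module so that `_init_weights` can recompute `inv_freq`. A small sketch of the angle table it produces and how rotary embeddings typically consume it (toy sizes, purely illustrative):

import torch

dim, theta, seqlen = 32, 10000.0, 8
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))   # (dim // 2,)
freqs = torch.outer(torch.arange(seqlen, dtype=torch.float), inv_freq)           # (seqlen, dim // 2)
cos, sin = freqs.cos(), freqs.sin()  # downstream, these rotate query/key channel pairs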
@@ -992,20 +1030,6 @@ class Qwen3OmniMoeVisionPatchEmbed(nn.Module):
         return hidden_states


-class Qwen3OmniMoeVisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Qwen3OmniMoeVisionBlock(GradientCheckpointingLayer):
     def __init__(self, config, attn_implementation: str = "sdpa") -> None:
         super().__init__()

@@ -1072,6 +1096,8 @@ class Qwen3OmniMoeVisionEncoder(Qwen3OmniMoePreTrainedModel):

         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         merge_size = self.spatial_merge_size


@@ -1245,7 +1271,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

         self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20])


@@ -1290,7 +1316,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
             emb = torch.cat((freqs, freqs), dim=-1)
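Note: `maybe_autocast(..., enabled=False)` replaces the truncated `with` line so the frequency matmul runs in float32 even under mixed precision. A rough standalone equivalent using plain `torch.autocast` (the shapes are stand-ins, not the model's real tensors):

import torch

inv_freq_expanded = torch.randn(1, 4, 1)                    # (bs, dim // 2, 1) stand-in
position_ids_expanded = torch.arange(16.0).view(1, 1, 16)   # (bs, 1, positions) stand-in
device_type = "cuda" if torch.cuda.is_available() else "cpu"
with torch.autocast(device_type=device_type, enabled=False):  # keep the accumulation in float32
    freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)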
@@ -1317,6 +1343,7 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module):
         return freqs_t


+@use_experts_implementation
 class Qwen3OmniMoeThinkerTextExperts(nn.Module):
     """
     ModuleList of experts.

@@ -1442,6 +1469,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed


+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen3OmniMoeThinkerTextAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""


@@ -1467,7 +1495,6 @@ class Qwen3OmniMoeThinkerTextAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = Qwen3OmniMoeThinkerTextRMSNorm(
             self.head_dim, eps=config.rms_norm_eps
         )  # unlike olmo, only on the head dim!

@@ -1595,7 +1622,9 @@ class Qwen3OmniMoeThinkerTextPreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(Qwen3OmniMoeThinkerTextTopKRouter, layer_name="mlp.gate", index=0),

@@ -2165,11 +2194,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         audio_feature_lengths = None

         if attention_mask is not None and position_ids is None:
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,

@@ -2184,7 +2210,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta =
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
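Note: the two hunks above replace the truncated cache check and `delta` assignment: the prefill pass caches `rope_deltas`, and later decode steps offset a plain `arange` by the cached sequence length plus that delta instead of recomputing the multimodal rope indices. A toy illustration of the decode-step arithmetic (all values invented for the example):

import torch

past_key_values_length = 12                     # tokens already held in the KV cache
rope_deltas = torch.tensor([[3], [0]])          # per-sample offsets cached during prefill
batch_size, seq_length = 2, 1                   # one new token per sample while decoding
delta = past_key_values_length + rope_deltas    # (batch_size, 1)
position_ids = torch.arange(seq_length).view(1, -1).expand(batch_size, -1) + delta
# -> tensor([[15], [12]]): each sample continues from its own multimodal position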
@@ -2250,6 +2276,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         feature_attention_mask=None,
         use_audio_in_video=False,
         video_second_per_grid=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(

@@ -2268,12 +2295,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
             feature_attention_mask=feature_attention_mask,
             use_audio_in_video=use_audio_in_video,
             video_second_per_grid=video_second_per_grid,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         model_inputs["position_ids"] = None

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
             model_inputs["input_features"] = None

@@ -2323,6 +2351,7 @@ class Qwen3OmniMoeRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""


@@ -2349,7 +2378,6 @@ class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = Qwen3OmniMoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
         self.k_norm = Qwen3OmniMoeRMSNorm(
             self.head_dim, eps=config.rms_norm_eps

@@ -2479,7 +2507,7 @@ class Qwen3OmniMoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(

@@ -2518,7 +2546,7 @@ class Qwen3OmniMoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling

@@ -2747,6 +2775,7 @@ class Qwen3OmniMoeTalkerTextMLP(nn.Module):
         return down_proj


+@use_experts_implementation
 class Qwen3OmniMoeTalkerTextExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""


@@ -3022,9 +3051,9 @@ class Qwen3OmniMoeTalkerModel(Qwen3OmniMoePreTrainedModel):

 @auto_docstring
 class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"
-    _tp_plan = {"
-    _pp_plan = {"
+    _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
+    _tp_plan = {"codec_head": "colwise_rep"}
+    _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
     config_class = Qwen3OmniMoeTalkerConfig
     base_model_prefix = "talker"
     _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
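Note: the completed `_tied_weights_keys` entry ties the talker's codec head to its codec embedding. A toy illustration of what such a tie means in plain PyTorch (sizes invented for the example):

import torch
from torch import nn

vocab, hidden = 16, 8
codec_embedding = nn.Embedding(vocab, hidden)
codec_head = nn.Linear(hidden, vocab, bias=False)
codec_head.weight = codec_embedding.weight  # tied: a single parameter seen from both modules
assert codec_head.weight.data_ptr() == codec_embedding.weight.data_ptr()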
@@ -3103,12 +3132,9 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
         if inputs_embeds is not None and inputs_embeds.shape[1] > 1:
             generation_step = -1
             residual_codes = None
-        if
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+        if position_ids is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     talker_input_ids,

@@ -3123,7 +3149,7 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta =
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)

@@ -3218,15 +3244,31 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3OmniMoeThinkerTextPreTrain
         return model_kwargs

     def prepare_inputs_for_generation(
-        self,
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        is_first_iteration=False,
+        **kwargs,
     ):
         hidden_states = kwargs.pop("hidden_states", None)
         inputs = super().prepare_inputs_for_generation(
-            input_ids,
+            input_ids,
+            past_key_values,
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
         )
-
+
+        # Qwen3-Omni will prepare position ids in forward with deltas
+        inputs["position_ids"] = None
+
         # TODO(raushan, gante): Refactor this part to a utility function
-        if
+        if not is_first_iteration and kwargs.get("use_cache", True):
             input_ids = input_ids[:, -1:]
             generation_step = kwargs.get("generation_step")
             trailing_text_hidden = kwargs.get("trailing_text_hidden")
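Note: `prepare_inputs_for_generation` now takes an explicit `is_first_iteration` flag instead of inferring the prefill step from `cache_position`; position ids are deferred to `forward`, and multimodal inputs are only dropped on later steps. A simplified sketch of that gating (standalone, assumes a 2D `input_ids` tensor; the helper name is invented):

def select_step_inputs(input_ids, is_first_iteration, use_cache=True):
    # Prefill: feed the whole prompt (and, in the model, the audio/vision features).
    # Decode steps with a cache: only the newest token is needed.
    if not is_first_iteration and use_cache:
        return input_ids[:, -1:]
    return input_ids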
@@ -3352,6 +3394,7 @@ class Qwen3OmniMoeConvNeXtBlock(nn.Module):
         return hidden_states


+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen3OmniMoeCode2WavAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""


@@ -3378,7 +3421,6 @@ class Qwen3OmniMoeCode2WavAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = nn.Identity()
         self.k_norm = nn.Identity()
         self.sliding_window = config.sliding_window

@@ -3718,7 +3760,9 @@ class Qwen3OmniMoeCode2WavDecoderBlock(Qwen3OmniMoePreTrainedModel):

         self.block = nn.ModuleList(block)

-
+        self.post_init()
+
+    def forward(self, hidden, **kwargs):
         for block in self.block:
             hidden = block(hidden)
         return hidden

@@ -3760,7 +3804,7 @@ class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel):

         self.post_init()

-    def forward(self, codes):
+    def forward(self, codes, **kwargs):
         if codes.shape[1] != self.config.num_quantizers:
             raise ValueError(f"Expected {self.config.num_quantizers} layer of codes, got {codes.shape[1]}")
         hidden = self.code_embedding(codes + self.code_offset).mean(1)
@@ -62,7 +62,11 @@ from ..qwen2_5_omni.modeling_qwen2_5_omni import (
     Qwen2_5OmniThinkerForConditionalGeneration,
     SnakeBeta,
 )
-from ..qwen2_5_omni.processing_qwen2_5_omni import
+from ..qwen2_5_omni.processing_qwen2_5_omni import (
+    Qwen2_5OmniProcessor,
+    Qwen2_5OmniProcessorKwargs,
+    SinusoidsPositionEmbedding,
+)
 from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
 from ..qwen3.configuration_qwen3 import Qwen3Config
 from ..qwen3.modeling_qwen3 import (

@@ -91,6 +95,7 @@ from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
     Qwen3VLMoeTextRotaryEmbedding,
     Qwen3VLMoeVisionAttention,
     Qwen3VLMoeVisionModel,
+    Qwen3VLMoeVisionRotaryEmbedding,
 )

@@ -668,6 +673,7 @@ class Qwen3OmniMoeTalkerConfig(PreTrainedConfig):
         self.audio_start_token_id = audio_start_token_id
         self.vision_start_token_id = vision_start_token_id
         self.speaker_id = speaker_id
+        self.initializer_range = self.text_config.initializer_range
         super().__init__(**kwargs)


@@ -758,6 +764,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
         upsampling_ratios=(2, 2),
         decoder_dim=1536,
         attention_dropout=0.0,
+        initializer_range=0.02,
         **kwargs,
     ):
         self.codebook_size = codebook_size

@@ -777,6 +784,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
         self.upsampling_ratios = upsampling_ratios
         self.decoder_dim = decoder_dim
         self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
         self.rope_parameters = rope_parameters

         super().__init__(**kwargs)

@@ -865,6 +873,7 @@ class Qwen3OmniMoeConfig(PreTrainedConfig):
         self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
         self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
         self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
+        self.initializer_range = self.thinker_config.initializer_range
         self.enable_audio_output = enable_audio_output
         self.im_start_token_id = im_start_token_id
         self.im_end_token_id = im_end_token_id
@@ -900,6 +909,19 @@ class Qwen3OmniMoePreTrainedModel(Qwen2_5OmniPreTrainedModel, PreTrainedModel):
             init.normal_(module.experts.gate_up_proj, mean=0.0, std=std)
             init.normal_(module.experts.down_proj, mean=0.0, std=std)
             init.normal_(module.gate.weight, mean=0.0, std=std)
+        elif isinstance(module, Qwen3OmniMoeCode2Wav):
+            init.copy_(
+                module.code_offset,
+                torch.arange(module.config.num_quantizers).view(1, -1, 1) * module.config.codebook_size,
+            )
+        elif isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, Qwen3OmniMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)


 class Qwen3OmniMoePreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
@@ -1205,6 +1227,7 @@ class Qwen3OmniMoeAudioEncoder(Qwen2_5OmniAudioEncoder):
         input_features,
         feature_lens=None,
         aftercnn_lens=None,
+        **kwargs,
     ):
         aftercnn_lens = _get_feat_extract_output_lengths(feature_lens)
         chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()

@@ -1296,6 +1319,10 @@ class Qwen3OmniMoeVisionPatchMerger(nn.Module):
         return hidden


+class Qwen3OmniMoeVisionRotaryEmbedding(Qwen3VLMoeVisionRotaryEmbedding):
+    pass
+
+
 class Qwen3OmniMoeVisionEncoder(Qwen3VLMoeVisionModel):
     config: Qwen3OmniMoeVisionEncoderConfig
     _no_split_modules = ["Qwen3OmniMoeVisionBlock"]

@@ -1521,11 +1548,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(Qwen2_5OmniThinkerForCondition
         audio_feature_lengths = None

         if attention_mask is not None and position_ids is None:
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,

@@ -1540,7 +1564,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(Qwen2_5OmniThinkerForCondition
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta =
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)
@@ -1849,6 +1873,9 @@ class Qwen3OmniMoeTalkerModel(Qwen3VLMoeTextModel):


 class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
+    _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
+    _tp_plan = {"codec_head": "colwise_rep"}
+    _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
     config_class = Qwen3OmniMoeTalkerConfig
     base_model_prefix = "talker"
     _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]

@@ -1961,12 +1988,9 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
         if inputs_embeds is not None and inputs_embeds.shape[1] > 1:
             generation_step = -1
             residual_codes = None
-        if
-            if (
-
-                or (cache_position is not None and cache_position[0] == 0)
-                or self.rope_deltas is None
-            ):
+        if position_ids is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if past_key_values_length == 0 or self.rope_deltas is None:
                 delta0 = (1 - attention_mask).sum(dim=-1).unsqueeze(1)
                 position_ids, rope_deltas = self.get_rope_index(
                     talker_input_ids,

@@ -1981,7 +2005,7 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
                 self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length = input_ids.shape
-                delta =
+                delta = (past_key_values_length + self.rope_deltas).to(input_ids.device)
                 position_ids = torch.arange(seq_length, device=input_ids.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 position_ids = position_ids.add(delta)

@@ -2038,15 +2062,31 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
         return model_kwargs

     def prepare_inputs_for_generation(
-        self,
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        is_first_iteration=False,
+        **kwargs,
     ):
         hidden_states = kwargs.pop("hidden_states", None)
         inputs = super().prepare_inputs_for_generation(
-            input_ids,
+            input_ids,
+            past_key_values,
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
         )
-
+
+        # Qwen3-Omni will prepare position ids in forward with deltas
+        inputs["position_ids"] = None
+
         # TODO(raushan, gante): Refactor this part to a utility function
-        if
+        if not is_first_iteration and kwargs.get("use_cache", True):
             input_ids = input_ids[:, -1:]
             generation_step = kwargs.get("generation_step")
             trailing_text_hidden = kwargs.get("trailing_text_hidden")
@@ -2339,7 +2379,9 @@ class Qwen3OmniMoeCode2WavDecoderBlock(Qwen3OmniMoePreTrainedModel):

         self.block = nn.ModuleList(block)

-
+        self.post_init()
+
+    def forward(self, hidden, **kwargs):
         for block in self.block:
             hidden = block(hidden)
         return hidden

@@ -2381,7 +2423,7 @@ class Qwen3OmniMoeCode2Wav(Qwen3OmniMoePreTrainedModel):

         self.post_init()

-    def forward(self, codes):
+    def forward(self, codes, **kwargs):
         if codes.shape[1] != self.config.num_quantizers:
             raise ValueError(f"Expected {self.config.num_quantizers} layer of codes, got {codes.shape[1]}")
         hidden = self.code_embedding(codes + self.code_offset).mean(1)
@@ -110,7 +110,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
-        attention_bias (`bool`,
+        attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.

@@ -197,13 +197,13 @@ class Qwen3VLConfig(PreTrainedConfig):
         vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 151655):
-            The
+            The token id used as the placeholder for image inputs.
         video_token_id (`int`, *optional*, defaults to 151656):
-            The
+            The token id used as the placeholder for video inputs.
         vision_start_token_id (`int`, *optional*, defaults to 151652):
-            The
+            The token id that marks the start of a vision segment (image or video).
         vision_end_token_id (`int`, *optional*, defaults to 151653):
-            The
+            The token id that marks the end of a vision segment (image or video).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie the word embeddings.
