transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/qwen3_vl/modeling_qwen3_vl.py

@@ -27,10 +27,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -38,8 +39,8 @@ from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring,
-from ...utils.generic import check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLTextConfig, Qwen3VLVisionConfig
 
 
@@ -81,6 +82,8 @@ class Qwen3VLVisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
@@ -202,8 +205,8 @@ class Qwen3VLVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
 
-        if self.config._attn_implementation
-            # Flash Attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
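The flash-attention branch describes packed variable-length sequences through `cu_seqlens` (cumulative sequence boundaries) instead of a padded batch, and `max_seqlen` is simply the longest packed sequence. A toy Python illustration of that bookkeeping, with made-up lengths:

    import torch

    # Three packed sequences of lengths 4, 2 and 5 -> cumulative boundaries [0, 4, 6, 11].
    cu_seqlens = torch.tensor([0, 4, 6, 11])
    max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()  # tensor(5)
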
@@ -292,7 +295,7 @@ class Qwen3VLTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
         self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20])
 
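Keeping `original_inv_freq` as a registered, non-persistent buffer rather than a plain attribute means it follows the module across `.to()` moves and appears in `named_buffers()`, while still staying out of checkpoints. A small self-contained illustration of that behaviour (a generic sketch, not the model class itself):

    import torch
    import torch.nn as nn

    class Demo(nn.Module):
        def __init__(self):
            super().__init__()
            inv_freq = torch.tensor([1.0, 0.1])
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = Demo()
    print([name for name, _ in m.named_buffers()])  # ['inv_freq', 'original_inv_freq']
    print(list(m.state_dict().keys()))              # [] -- non-persistent buffers are not checkpointed
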
@@ -337,7 +340,7 @@ class Qwen3VLTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
             emb = torch.cat((freqs, freqs), dim=-1)
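`maybe_autocast` (imported above from `...utils.generic`) is used here so the rotary frequencies stay in float32 even when the caller runs under autocast. A minimal sketch of what such a helper might look like, assuming it merely wraps `torch.autocast` and degrades to a no-op when no device type is given; the actual implementation in transformers/utils/generic.py may differ:

    from contextlib import nullcontext
    from typing import Optional

    import torch

    def maybe_autocast(device_type: Optional[str] = None, enabled: bool = True, **kwargs):
        # Assumed behaviour: wrap torch.autocast when a device type is known, otherwise do nothing.
        if device_type is None:
            return nullcontext()
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
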
@@ -413,6 +416,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen3VLTextAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -439,7 +443,6 @@ class Qwen3VLTextAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
         self.k_norm = Qwen3VLTextRMSNorm(
             self.head_dim, eps=config.rms_norm_eps
@@ -592,6 +595,12 @@ class Qwen3VLPreTrainedModel(PreTrainedModel):
         "attentions": Qwen3VLTextAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Qwen3VLVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
     config: Qwen3VLVisionConfig
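Storing `dim` and `theta` on the vision rotary module (see the earlier `__init__` hunk) is what lets `_init_weights` recompute the non-persistent `inv_freq` buffer here, for example when the module is first materialized with empty weights. The formula is easy to sanity-check by hand:

    import torch

    dim, theta = 8, 10000.0
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    print(inv_freq)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])
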
@@ -632,6 +641,8 @@ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):

         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         merge_size = self.spatial_merge_size

@@ -1201,44 +1212,19 @@ class Qwen3VLModel(Qwen3VLPreTrainedModel):
             deepstack_visual_embeds = deepstack_video_embeds

         if position_ids is None:
-
-
-            )
-            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
-                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
-                # Only apply conversion for floating point tensors (inverted masks)
-                if attention_mask_tensor.dtype.is_floating_point:
-                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
-                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
-
-            # Calculate RoPE index once per generation in the pre-fill stage only.
-            # When compiling, we can't check tensor values thus we check only input length
-            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
-            # models currently cannot do asssisted decoding
-            prefill_compiled_stage = is_torchdynamo_compiling() and (
-                (input_ids is not None and input_ids.shape[1] != 1)
-                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
-            )
-            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
-                (cache_position is not None and cache_position[0] == 0)
-                or (past_key_values is None or past_key_values.get_seq_length() == 0)
-            )
-            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if self.rope_deltas is None or past_key_values_length == 0:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
                     video_grid_thw,
-                    attention_mask=
+                    attention_mask=attention_mask,
                 )
                 self.rope_deltas = rope_deltas
             # then use the prev pre-calculated rope-deltas to get the correct position ids
             else:
                 batch_size, seq_length, _ = inputs_embeds.shape
-                delta = (
-                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
-                    if cache_position is not None
-                    else 0
-                )
+                delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 if cache_position is not None:  # otherwise `deltas` is an int `0`
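The rewritten branch above drops the compile-aware prefill detection and keys purely off the cache length: during decoding, new token positions are an arange over the new tokens shifted by `past_key_values_length + rope_deltas`. A hedged sketch of that decode-step arithmetic (single RoPE axis, helper name is illustrative):

import torch

def decode_step_position_ids(seq_length, batch_size, past_key_values_length, rope_deltas, device="cpu"):
    # Positions for the newly decoded tokens restart at 0 and are shifted by the
    # cached length plus the per-sample rope delta, mirroring the else-branch above.
    position_ids = torch.arange(seq_length, device=device)
    position_ids = position_ids.view(1, -1).expand(batch_size, -1)
    delta = past_key_values_length + rope_deltas  # (batch_size, 1) in the model
    return position_ids + delta

# e.g. with a cache of 12 tokens and rope_deltas == tensor([[-3]]), the single new
# token ends up at position 12 - 3 = 9 along each RoPE axis.
print(decode_step_position_ids(1, 1, 12, torch.tensor([[-3]])))  # tensor([[9]])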
@@ -1322,7 +1308,7 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
     def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
         return self.model.get_image_features(pixel_values, image_grid_thw)

-    @
+    @can_return_tuple
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1414,6 +1400,8 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
             rope_deltas=outputs.rope_deltas,
         )

@@ -1430,6 +1418,7 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1446,13 +1435,39 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        # Qwen3VL position_ids are
-
-
-
+        # Qwen3VL position_ids are prepared with rope_deltas
+        if position_ids is None:
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do asssisted decoding
+            if model_inputs["cache_position"][0] == 0 or self.model.rope_deltas is None:
+                vision_positions, rope_deltas = self.model.get_rope_index(
+                    model_inputs.get("input_ids", None),
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    attention_mask=attention_mask,
+                )
+                self.model.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            elif "position_ids" in model_inputs:
+                batch_size, seq_length = model_inputs["position_ids"].shape
+                device = model_inputs["position_ids"].device
+                position_ids = torch.arange(seq_length, device=device)
+                position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
+                delta = cache_position[0] + self.model.rope_deltas
+                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                vision_positions = position_ids + delta.expand_as(position_ids)
+
+            # Concatenate "text + vision" positions into [4, bs, seq-len]
+            text_positions = model_inputs["position_ids"][None, ...]
+            model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
+
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
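The added `prepare_inputs_for_generation` logic above stacks the plain text positions on top of the three m-RoPE vision axes, producing a single [4, bs, seq-len] tensor. A small shape-only sketch (values are illustrative and not taken from `get_rope_index`):

import torch

bs, seq = 2, 5
text_positions = torch.arange(seq).expand(bs, seq)[None, ...]    # (1, bs, seq) plain text axis
vision_positions = torch.arange(seq).expand(3, bs, seq).clone()  # (3, bs, seq) m-RoPE axes (t, h, w)
position_ids = torch.cat([text_positions, vision_positions], dim=0)
print(position_ids.shape)  # torch.Size([4, 2, 5])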
@@ -22,6 +22,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
@@ -31,11 +32,11 @@ from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import ProcessingKwargs, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring,
-from ...utils.generic import check_model_inputs
+from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.generic import check_model_inputs, maybe_autocast
 from ...video_utils import VideoInput
 from ..llama.modeling_llama import LlamaRotaryEmbedding
 from ..qwen2_5_vl.modeling_qwen2_5_vl import (
@@ -151,7 +152,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
-        attention_bias (`bool`,
+        attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
@@ -238,13 +239,13 @@ class Qwen3VLConfig(PreTrainedConfig):
         vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 151655):
-            The
+            The token id used as the placeholder for image inputs.
         video_token_id (`int`, *optional*, defaults to 151656):
-            The
+            The token id used as the placeholder for video inputs.
         vision_start_token_id (`int`, *optional*, defaults to 151652):
-            The
+            The token id that marks the start of a vision segment (image or video).
         vision_end_token_id (`int`, *optional*, defaults to 151653):
-            The
+            The token id that marks the end of a vision segment (image or video).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie the word embeddings.

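The docstring fixes above spell out the four special token ids. Assuming the usual top-level `Qwen3VLConfig` export, a config using the documented defaults would look roughly like this (a sketch, not an excerpt from the package):

from transformers import Qwen3VLConfig

config = Qwen3VLConfig(
    image_token_id=151655,         # placeholder token id for image inputs
    video_token_id=151656,         # placeholder token id for video inputs
    vision_start_token_id=151652,  # marks the start of a vision segment
    vision_end_token_id=151653,    # marks the end of a vision segment
)
print(config.image_token_id, config.vision_start_token_id)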
@@ -389,7 +390,7 @@ class Qwen3VLTextRotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
             emb = torch.cat((freqs, freqs), dim=-1)
@@ -488,6 +489,12 @@ class Qwen3VLPreTrainedModel(Qwen2VLPreTrainedModel):
         "attentions": Qwen3VLTextAttention,
     }

+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Qwen3VLVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
     config: Qwen3VLVisionConfig
@@ -528,6 +535,8 @@ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):

         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         merge_size = self.spatial_merge_size

@@ -1033,44 +1042,19 @@ class Qwen3VLModel(Qwen2_5_VLModel):
             deepstack_visual_embeds = deepstack_video_embeds

         if position_ids is None:
-
-
-            )
-            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
-                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
-                # Only apply conversion for floating point tensors (inverted masks)
-                if attention_mask_tensor.dtype.is_floating_point:
-                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
-                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
-
-            # Calculate RoPE index once per generation in the pre-fill stage only.
-            # When compiling, we can't check tensor values thus we check only input length
-            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
-            # models currently cannot do asssisted decoding
-            prefill_compiled_stage = is_torchdynamo_compiling() and (
-                (input_ids is not None and input_ids.shape[1] != 1)
-                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
-            )
-            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
-                (cache_position is not None and cache_position[0] == 0)
-                or (past_key_values is None or past_key_values.get_seq_length() == 0)
-            )
-            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+            if self.rope_deltas is None or past_key_values_length == 0:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
                     video_grid_thw,
-                    attention_mask=
+                    attention_mask=attention_mask,
                 )
                 self.rope_deltas = rope_deltas
             # then use the prev pre-calculated rope-deltas to get the correct position ids
             else:
                 batch_size, seq_length, _ = inputs_embeds.shape
-                delta = (
-                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
-                    if cache_position is not None
-                    else 0
-                )
+                delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                 if cache_position is not None:  # otherwise `deltas` is an int `0`
@@ -1105,7 +1089,7 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
     config: Qwen3VLConfig
     _checkpoint_conversion_mapping = {}

-    @
+    @can_return_tuple
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1197,6 +1181,8 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
             rope_deltas=outputs.rope_deltas,
         )

@@ -1213,6 +1199,7 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1229,13 +1216,39 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        # Qwen3VL position_ids are
-
-
-
+        # Qwen3VL position_ids are prepared with rope_deltas
+        if position_ids is None:
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do asssisted decoding
+            if model_inputs["cache_position"][0] == 0 or self.model.rope_deltas is None:
+                vision_positions, rope_deltas = self.model.get_rope_index(
+                    model_inputs.get("input_ids", None),
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    attention_mask=attention_mask,
+                )
+                self.model.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            elif "position_ids" in model_inputs:
+                batch_size, seq_length = model_inputs["position_ids"].shape
+                device = model_inputs["position_ids"].device
+                position_ids = torch.arange(seq_length, device=device)
+                position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
+                delta = cache_position[0] + self.model.rope_deltas
+                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                vision_positions = position_ids + delta.expand_as(position_ids)
+
+            # Concatenate "text + vision" positions into [4, bs, seq-len]
+            text_positions = model_inputs["position_ids"][None, ...]
+            model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
+
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -1391,9 +1404,9 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
-        Main method to prepare for the model one or several
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
         Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

         Args:
@@ -1405,7 +1418,7 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
             (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
             `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
             videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
-                The
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
@@ -99,9 +99,9 @@ class Qwen3VLProcessor(ProcessorMixin):
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
-        Main method to prepare for the model one or several
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
         Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

         Args:
@@ -113,7 +113,7 @@ class Qwen3VLProcessor(ProcessorMixin):
             (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
             `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
             videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
-                The
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
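For context on the `__call__` docstring fixed above, a hedged usage sketch of the processor; the checkpoint name and the vision placeholder tokens follow the Qwen2-VL convention and are illustrative rather than taken from this diff:

from PIL import Image
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")  # illustrative checkpoint
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # dummy RGB frame
text = "<|vision_start|><|image_pad|><|vision_end|>Describe this image."  # assumed placeholder convention
inputs = processor(text=[text], images=[image], return_tensors="pt")
print(sorted(inputs.keys()))  # e.g. attention_mask, image_grid_thw, input_ids, pixel_values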