transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to their public registry. It is provided for informational purposes only.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/persimmon/modeling_persimmon.py

@@ -46,6 +46,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.generic import maybe_autocast
 from .configuration_persimmon import PersimmonConfig
 
 
@@ -76,7 +77,7 @@ class PersimmonRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     # Ignore copy
@@ -118,7 +119,7 @@ class PersimmonRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
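Both persimmon hunks follow a pattern repeated across this release: original_inv_freq goes from a bare tensor attribute to a registered non-persistent buffer. The practical difference, sketched below with a toy module (not transformers code), is that buffers track Module.to() device and dtype moves, while persistent=False keeps them out of the checkpoint:

import torch
import torch.nn as nn

class RotaryBuffers(nn.Module):
    # Toy stand-in for the RotaryEmbedding pattern in these hunks.
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # rc0 kept a bare attribute; rc2 registers a clone as a buffer too,
        # so it follows .to()/.cuda() like every other module tensor.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = RotaryBuffers().to(torch.float64)
print(m.original_inv_freq.dtype)              # torch.float64: moved with the module
print("original_inv_freq" in m.state_dict())  # False: persistent=False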
transformers/models/phi/modeling_phi.py

@@ -13,7 +13,7 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForSequenceClassification,
@@ -25,7 +25,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_phi import PhiConfig
 
 
@@ -49,7 +49,7 @@ class PhiRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -90,7 +90,7 @@ class PhiRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -172,6 +172,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class PhiAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -187,7 +188,6 @@ class PhiAttention(nn.Module):
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
-        self.rotary_fn = apply_rotary_pos_emb
         self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True)
         self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"])
         self.qk_layernorm = config.qk_layernorm
@@ -206,7 +206,6 @@ class PhiAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
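The phi hunks replace the per-instance self.rotary_fn = apply_rotary_pos_emb assignment with a @use_kernelized_func(apply_rotary_pos_emb) class decorator. A minimal sketch of the idea, assuming the decorator simply attaches the default function at class level so a hub-provided kernel can be substituted later (the real implementation lives in transformers/integrations/ and may differ):

from typing import Callable

def use_kernelized_func(default_fn: Callable):
    # Hypothetical sketch: record the reference implementation on the class;
    # an integration layer could later rebind `rotary_fn` to a faster kernel
    # fetched from the hub, without touching the module's __init__.
    def decorator(cls):
        cls.rotary_fn = staticmethod(default_fn)
        return cls
    return decorator

@use_kernelized_func(lambda q, k, cos, sin: (q, k))  # placeholder rotary fn
class ToyAttention:
    def apply_rope(self, q, k, cos, sin):
        return self.rotary_fn(q, k, cos, sin)

Moving the assignment from __init__ to a decorator means the swap happens once per class rather than once per layer instance.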
transformers/models/phi/modular_phi.py

@@ -92,7 +92,6 @@ class PhiAttention(LlamaAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
transformers/models/phi3/modeling_phi3.py

@@ -44,6 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_phi3 import Phi3Config
 
 
@@ -82,7 +83,7 @@ class Phi3RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -123,7 +124,7 @@ class Phi3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -47,7 +47,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, torch_int
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from .configuration_phi4_multimodal import Phi4MultimodalAudioConfig, Phi4MultimodalConfig, Phi4MultimodalVisionConfig
 
 
@@ -602,7 +602,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):
 
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
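Around the `index_put` call the diff only swaps the autocast guard, but the call itself is worth unpacking: it scatters the merged image features into the token positions recorded in `positions_tuple`. A standalone miniature of the same operation:

```python
import torch

embeds = torch.zeros(1, 5, 4)                             # (batch, seq, hidden)
positions = (torch.tensor([0, 0]), torch.tensor([1, 3]))  # batch idx, seq idx
values = torch.ones(2, 4)                                 # one row per position

out = embeds.index_put(indices=positions, values=values, accumulate=False)
print(out[0, 1].tolist(), out[0, 3].tolist())             # both rows replaced
```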
@@ -881,6 +881,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 def unfold_tensor(tensor, max_seq_len):
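The new `_init_weights` branch sets the audio mean-variance normalization statistics to an identity transform (mean 0, inverse std 1). A hedged sketch of what such a layer plausibly looks like, with the field names taken from the hunk and everything else assumed:

```python
import torch
from torch import nn


class MeanVarianceNormLayer(nn.Module):
    """Normalize features with learned global statistics (illustrative)."""

    def __init__(self, dim: int):
        super().__init__()
        self.global_mean = nn.Parameter(torch.zeros(dim))
        self.global_invstd = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # With mean 0 and invstd 1, this starts as the identity mapping.
        return (x - self.global_mean) * self.global_invstd
```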
@@ -1014,7 +1017,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask
 
-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)
 
@@ -1116,7 +1119,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
@@ -1459,7 +1462,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -1500,7 +1503,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -37,7 +37,7 @@ from ...modeling_rope_utils import RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, logging
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from ..phi3.configuration_phi3 import Phi3Config
 from ..phi3.modeling_phi3 import (
     Phi3DecoderLayer,
@@ -844,7 +844,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):
 
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
@@ -1123,6 +1123,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
@@ -1205,7 +1208,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask
 
-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)
 
@@ -1358,7 +1361,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
@@ -30,15 +30,20 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_phimoe import PhimoeConfig
 
 
@@ -59,7 +64,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -113,7 +118,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
@@ -194,6 +199,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class PhimoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -219,7 +225,6 @@ class PhimoeAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
 
     def forward(
         self,
@@ -327,6 +332,7 @@ class PhimoeMultiplier(torch.autograd.Function):
     )
 
 
+@use_experts_implementation
 class PhimoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
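`PhimoeExperts` keeps all expert weights in stacked 3D tensors, and the new `@use_experts_implementation` decorator presumably lets transformers swap the dispatch strategy (looped, batched, or grouped GEMM). An illustrative module with that storage layout; the names and gating are simplifications, not PhiMoE's actual MLP:

```python
import torch
from torch import nn


class Experts(nn.Module):
    """All experts in two 3D tensors instead of a ModuleList of Linears."""

    def __init__(self, num_experts, hidden, intermediate):
        super().__init__()
        self.up = nn.Parameter(torch.randn(num_experts, intermediate, hidden) * 0.02)
        self.down = nn.Parameter(torch.randn(num_experts, hidden, intermediate) * 0.02)

    def forward(self, x, expert_idx):
        # Tokens routed to one expert pass through its weight slices.
        h = torch.nn.functional.silu(x @ self.up[expert_idx].T)
        return h @ self.down[expert_idx].T


experts = Experts(num_experts=4, hidden=8, intermediate=16)
tokens = torch.randn(3, 8)
print(experts(tokens, expert_idx=2).shape)  # torch.Size([3, 8])
```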
@@ -617,7 +623,9 @@ class PhimoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(PhimoeTopKRouter, layer_name="mlp.router", index=0),
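`_can_compile_fullgraph` is now derived from `is_grouped_mm_available()`, tying full-graph `torch.compile` support to whether a grouped GEMM kernel exists for the expert layers (see the linked experts_interface docs). A hedged sketch of such a capability probe, assuming it keys off PyTorch's private grouped GEMM entry point:

```python
import torch


def grouped_mm_available() -> bool:
    # Assumption: recent PyTorch exposes a grouped GEMM as torch._grouped_mm;
    # absence of the attribute means MoE fullgraph compile must stay off.
    return hasattr(torch, "_grouped_mm")


print(grouped_mm_available())
```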
@@ -24,7 +24,7 @@ from ...modeling_layers import (
     GenericForSequenceClassification,
 )
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from ...utils.generic import OutputRecorder
+from ...utils.generic import OutputRecorder, maybe_autocast
 from ..llama.modeling_llama import LlamaAttention
 from ..mixtral.modeling_mixtral import (
     MixtralDecoderLayer,
@@ -52,7 +52,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     def forward(self, x, position_ids=None, layer_type=None):
         if layer_type is not None:
@@ -74,7 +74,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
@@ -481,6 +481,7 @@ class Pix2StructVisionModel(Pix2StructPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
@@ -1359,6 +1360,7 @@ class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
         An instance of [`T5Tokenizer`]. The tokenizer is a required input.
     """
 
-    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "Pix2StructImageProcessor"
-    tokenizer_class = ("T5Tokenizer",)
-
     def __init__(self, image_processor, tokenizer):
         tokenizer.return_token_type_ids = False
         super().__init__(image_processor, tokenizer)
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixio model configuration"""
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pixio import *
+    from .modeling_pixio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
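The new `pixio/__init__.py` wires the package through `_LazyModule`, so heavy submodules import only when an attribute is first touched. The real `_LazyModule` does more bookkeeping than this; a minimal PEP 562-style sketch of the underlying idea:

```python
# Minimal module-level lazy import (PEP 562); transformers' _LazyModule
# generalizes this with an import structure computed from the package.
import importlib

_REGISTRY = {"PixioConfig": ".configuration_pixio"}


def __getattr__(name):
    if name in _REGISTRY:
        submodule = importlib.import_module(_REGISTRY[name], __package__)
        return getattr(submodule, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```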
@@ -0,0 +1,151 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/pixio/modular_pixio.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_pixio.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PreTrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class PixioConfig(BackboneConfigMixin, PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
+    Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the ViT
+    [facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        n_cls_tokens (`int`, *optional*, defaults to 8):
+            Number of class tokens in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 256):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import PixioConfig, PixioModel
+
+    >>> # Initializing a Pixio pixio-huge style configuration
+    >>> configuration = PixioConfig()
+
+    >>> # Initializing a model (with random weights) from the pixio-huge style configuration
+    >>> model = PixioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pixio"
+
+    def __init__(
+        self,
+        hidden_size=1280,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        mlp_ratio=4,
+        n_cls_tokens=8,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=256,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.drop_path_rate = drop_path_rate
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+        self.n_cls_tokens = n_cls_tokens
+
+
+__all__ = ["PixioConfig"]
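The backbone plumbing at the end of `__init__` aligns `out_features` with `out_indices` against the generated `stage_names`. A quick standalone exercise of that helper, using a toy stage list rather than Pixio's 32 stages:

```python
from transformers.utils.backbone_utils import get_aligned_output_features_output_indices

stage_names = ["stem"] + [f"stage{i}" for i in range(1, 5)]
features, indices = get_aligned_output_features_output_indices(
    out_features=None, out_indices=[2, 4], stage_names=stage_names
)
print(features, indices)  # e.g. ['stage2', 'stage4'] and the matching indices
```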