transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/pe_video/modular_pe_video.py
@@ -0,0 +1,219 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput
+from ...utils import ModelOutput, auto_docstring, can_return_tuple
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel, AutoModelForImageClassification
+from ..pe_audio_video.modeling_pe_audio_video import (
+    PeAudioVideoContrastiveHead,
+    PeAudioVideoEncoder,
+    PeAudioVideoEncoderPatchEmbedder,
+    PeAudioVideoPreTrainedModel,
+)
+from .configuration_pe_video import PeVideoConfig, PeVideoEncoderConfig
+
+
+# TODO: not sure about the typing for text_model_output
+@dataclass
+# @auto_docstring
+class PeVideoOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits_video_text: Optional[torch.FloatTensor] = None
+    text_video_embeds: Optional[torch.FloatTensor] = None
+    video_embeds: Optional[torch.FloatTensor] = None
+    text_outputs: BaseModelOutputWithPooling = None
+    video_outputs: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_outputs", "video_outputs"] else getattr(self, k).to_tuple() for k in self.keys()
+        )
+
+
+class PeVideoContrastiveHead(PeAudioVideoContrastiveHead): ...
+
+
+class PeVideoEncoderPatchEmbedder(PeAudioVideoEncoderPatchEmbedder): ...
+
+
+class PeVideoEncoderEmbedder(nn.Module):
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__()
+        self.vision_model = AutoModelForImageClassification.from_config(config.vision_config)
+        self.proj = nn.Linear(config.vision_config.num_labels, config.hidden_size, bias=False)
+        self.data_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = pixel_values_videos.shape
+
+        pixel_values_videos = pixel_values_videos.view(-1, *input_shape[2:])
+        vision_encoder_outputs = self.vision_model(pixel_values_videos)
+
+        logits = vision_encoder_outputs.logits.view(*input_shape[:2], -1)
+        logits = F.normalize(logits, dim=-1)
+
+        vision_features = self.proj(logits)
+        inputs_embeds = self.data_proj(vision_features)
+
+        return inputs_embeds, padding_mask
+
+
+class PeVideoPreTrainedModel(PeAudioVideoPreTrainedModel):
+    base_model_prefix = "video_model"
+    main_input_name = "pixel_values_videos"
+
+
+@auto_docstring(
+    custom_intro="""
+    The PeVideo Encoder model.
+    """
+)
+class PeVideoEncoder(PeAudioVideoEncoder):
+    base_model_prefix = "video_model.video_encoder"
+    main_input_name = "pixel_values_videos"
+
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__(config)
+        self.embedder = PeVideoEncoderEmbedder(config)
+
+    @can_return_tuple
+    @check_model_inputs
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask_videos: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
+        inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
+
+        if attention_mask is not None:
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.output(hidden_states)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=hidden_states[:, 1:],
+            pooler_output=hidden_states[:, 0],
+        )
+
+
+class PeVideoModel(PeVideoPreTrainedModel):
+    main_input_name = "input_ids"
+
+    def __init__(self, config: PeVideoConfig):
+        super().__init__(config)
+        self.text_model = AutoModel.from_config(config.text_config)
+        self.video_encoder = PeVideoEncoder(config.video_config)
+
+        self.text_video_head = PeVideoContrastiveHead(config.text_config.hidden_size, config.text_config.hidden_size)
+        self.video_head = PeVideoContrastiveHead(config.video_config.hidden_size, config.text_config.hidden_size)
+
+        self.text_video_logit_scale = nn.Parameter(torch.zeros(1))
+        self.text_video_logit_bias = nn.Parameter(torch.zeros(1))
+
+        self.post_init()
+
+    def get_text_features(self, input_ids, attention_mask=None):
+        # TODO: should it be named feature or embeds
+        text_outputs: MaskedLMOutput = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True,
+        )
+
+        text_features = text_outputs.last_hidden_state
+        text_features = self.text_video_head(text_features)
+        return text_features
+
+    def get_video_features(self, pixel_values_videos, padding_mask_videos=None):
+        # TODO: should it be named feature or embeds
+        video_outputs: BaseModelOutputWithPooling = self.video_encoder(
+            pixel_values_videos=pixel_values_videos,
+            padding_mask_videos=padding_mask_videos,
+            return_dict=True,
+        )
+        video_features = self.video_head(video_outputs.pooler_output)
+        return video_features
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        pixel_values_videos: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask_videos: Optional[torch.Tensor] = None,
+        return_loss: Optional[bool] = None,
+        **kwargs,
+    ) -> PeVideoOutput:
+        video_outputs: BaseModelOutputWithPooling = self.video_encoder(
+            pixel_values_videos=pixel_values_videos, padding_mask_videos=padding_mask_videos, **kwargs
+        )
+        kwargs["output_hidden_states"] = True
+        text_outputs: MaskedLMOutput = self.text_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+
+        video_embeds = video_outputs.pooler_output
+        video_embeds = self.video_head(video_embeds)
+
+        text_video_embeds = text_outputs.hidden_states[-1][:, 0]
+        text_video_embeds = self.text_video_head(text_video_embeds)
+
+        logits_video_text = video_embeds @ text_video_embeds.T
+        logits_video_text = logits_video_text * self.text_video_logit_scale + self.text_video_logit_bias
+
+        loss = None
+        if return_loss:
+            labels = torch.eye(logits_video_text.shape[0], device=logits_video_text.device)
+            loss = -F.logsigmoid(labels * logits_video_text).sum() / logits_video_text.shape[0]
+
+        return PeVideoOutput(
+            logits_video_text=logits_video_text,
+            text_video_embeds=text_video_embeds,
+            video_embeds=video_embeds,
+            text_outputs=text_outputs,
+            video_outputs=video_outputs,
+            loss=loss,
+        )
+
+
+__all__ = [
+    "PeVideoEncoder",
+    "PeVideoModel",
+]
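
A minimal, self-contained sketch (not part of the diff; names and values illustrative) of the sigmoid-style video-text contrastive scoring that PeVideoModel.forward implements above, with random tensors standing in for the text and video head outputs:

# Sketch only: reproduces the logit/loss computation from PeVideoModel.forward.
import torch
import torch.nn.functional as F

batch, dim = 4, 64
video_embeds = torch.randn(batch, dim)       # stand-in for self.video_head(pooler_output)
text_video_embeds = torch.randn(batch, dim)  # stand-in for self.text_video_head(CLS state)
logit_scale = torch.zeros(1)                 # learned scalar, zero-initialized in __init__
logit_bias = torch.zeros(1)                  # learned scalar, zero-initialized in __init__

logits_video_text = video_embeds @ text_video_embeds.T
logits_video_text = logits_video_text * logit_scale + logit_bias

# as in the return_loss branch: matching video/text pairs lie on the diagonal
labels = torch.eye(batch)
loss = -F.logsigmoid(labels * logits_video_text).sum() / batch
print(logits_video_text.shape, loss.item())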

transformers/models/pe_video/processing_pe_video.py
@@ -0,0 +1,10 @@
+from ...processing_utils import ProcessorMixin
+
+
+class PeVideoProcessor(ProcessorMixin):
+    attributes = ["video_processor", "tokenizer"]
+    video_processor_class = "PeVideoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+
+__all__ = ["PeVideoProcessor"]

transformers/models/pe_video/video_processing_pe_video.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import torch
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import PILImageResampling
+from ...processing_utils import Unpack, VideosKwargs
+from ...video_processing_utils import BaseVideoProcessor, VideoMetadata
+from ...video_utils import VideoInput
+
+
+class PeVideoVideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BILINEAR
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        num_frames: Optional[int] = None,
+        fps: Optional[Union[int, float]] = None,
+        **kwargs,
+    ):
+        if num_frames:
+            total_frames = metadata.total_num_frames
+            num_frames = num_frames if num_frames is not None else self.num_frames
+            assert num_frames is not None, "`num_frames` must be specified if `fixed_len_video == True`"
+            frame_idxs = [int(i * (total_frames - 1) / (num_frames - 1)) for i in range(num_frames)]
+            return torch.tensor(frame_idxs)
+        else:
+            return super().sample_frames(metadata, num_frames, fps, **kwargs)
+
+    def _preprocess(
+        self,
+        videos: VideoInput,
+        **kwargs: Unpack[VideosKwargs],
+    ) -> BatchFeature:
+        # Always set `return_tensors` to `None` since it won't pad variable length videos
+        # We'll handle this after we call the parent' method
+        return_tensors = kwargs.pop("return_tensors", None)
+        result = super()._preprocess(videos, **kwargs)
+        pixels = result.pixel_values_videos
+        data = {"pixel_values_videos": pixels}
+        if return_tensors:
+            lengths = torch.tensor([video.size(0) for video in pixels])
+            pixels = torch.nn.utils.rnn.pad_sequence(pixels, batch_first=True, padding_value=0.0)
+            data["pixel_values_videos"] = pixels
+            if lengths.unique().size(0) > 1:
+                mask = torch.arange(lengths.max())[None] < lengths[:, None]
+                data["padding_mask_videos"] = mask
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["PeVideoVideoProcessor"]
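
A short sketch (not part of the diff) of the padding behaviour in PeVideoVideoProcessor._preprocess above: variable-length videos are padded to the longest clip and a boolean padding_mask_videos is emitted only when clip lengths actually differ.

# Sketch only: the pad_sequence + mask logic used by _preprocess.
import torch

videos = [torch.randn(3, 3, 224, 224), torch.randn(5, 3, 224, 224)]  # 3-frame and 5-frame clips
lengths = torch.tensor([v.size(0) for v in videos])

pixels = torch.nn.utils.rnn.pad_sequence(videos, batch_first=True, padding_value=0.0)
mask = torch.arange(lengths.max())[None] < lengths[:, None]

print(pixels.shape)  # torch.Size([2, 5, 3, 224, 224])
print(mask)          # [[True, True, True, False, False], [True, True, True, True, True]]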

transformers/models/pegasus/configuration_pegasus.py
@@ -143,6 +143,7 @@ class PegasusConfig(PreTrainedConfig):
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
         super().__init__(
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,

transformers/models/pegasus/modeling_pegasus.py
@@ -443,6 +443,8 @@ class PegasusPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, PegasusSinusoidalPositionalEmbedding):
             init.copy_(module.weight, module.create_weight())
+        elif isinstance(module, PegasusForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)


 class PegasusEncoder(PegasusPreTrainedModel):
@@ -1220,6 +1222,7 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PegasusDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)

transformers/models/pegasus_x/modeling_pegasus_x.py
@@ -1476,6 +1476,7 @@ class PegasusXDecoderWrapper(PegasusXPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PegasusXDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)

transformers/models/perceiver/image_processing_perceiver_fast.py
@@ -113,7 +113,6 @@ class PerceiverImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)


transformers/models/perceiver/modeling_perceiver.py
@@ -551,9 +551,13 @@ class PerceiverPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
-        elif isinstance(module, nn.LayerNorm):
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)


 @auto_docstring(
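
For clarity, an equivalent standalone sketch (not part of the diff) of what the extended branch does for a BatchNorm2d module: reset the affine parameters and, when tracked, the running statistics.

# Sketch only: mirrors the init.zeros_/init.ones_ calls added above.
import torch
import torch.nn as nn

bn = nn.BatchNorm2d(8)
with torch.no_grad():
    bn.bias.zero_()
    bn.weight.fill_(1.0)
    if bn.running_mean is not None:  # only present when track_running_stats=True
        bn.running_mean.zero_()
        bn.running_var.fill_(1.0)
        bn.num_batches_tracked.zero_()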
@@ -307,7 +307,6 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_images = [p[None] if p.ndim == 3 else p for p in processed_images]  # add tiles dimension if needed
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 
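Both fast image processors above stop stacking the tensors themselves before building the output. The change appears to rely on BatchFeature converting a list of same-shaped tensors when a tensor_type is requested (feature_extraction_utils.py also changes in this release); a small sketch of that usage, under that assumption:

    import torch
    from transformers import BatchFeature

    # Stand-ins for the per-image tensors a fast processor would produce.
    processed_images = [torch.rand(3, 224, 224) for _ in range(3)]

    # Assumption: with tensor_type="pt" the BatchFeature turns the list into a
    # single batched tensor; with tensor_type=None the plain list is kept.
    features = BatchFeature(data={"pixel_values": processed_images}, tensor_type="pt")
    print(type(features["pixel_values"]), features["pixel_values"].shape)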
@@ -451,6 +451,7 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -462,12 +463,15 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
         return model_inputs
@@ -293,6 +293,7 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -304,12 +305,15 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
         return model_inputs
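The modeling and modular hunks above thread a new is_first_iteration flag through prepare_inputs_for_generation so that pixel values are only forwarded on the first step, or when caching is off. A rough standalone sketch of just the gating logic; everything other than the is_first_iteration/use_cache names is a placeholder:

    def prepare_inputs_for_generation_sketch(
        input_ids,
        pixel_values=None,
        pixel_values_videos=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Placeholder for the inputs the parent class would normally assemble.
        model_inputs = {"input_ids": input_ids}

        # Vision features only need to be encoded once: on the first iteration
        # (not necessarily prefill -- it may be the first call on top of a cached
        # system prompt). Afterwards they already live in the KV cache, so they
        # are dropped unless caching is disabled.
        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values
            model_inputs["pixel_values_videos"] = pixel_values_videos
        return model_inputs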
@@ -77,7 +77,7 @@ class PersimmonRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     # Ignore copy
@@ -49,7 +49,7 @@ class PhiRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -83,7 +83,7 @@ class Phi3RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
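Several rotary embedding classes in this diff switch original_inv_freq from a plain attribute to a non-persistent buffer holding a clone of inv_freq. A minimal sketch of the pattern, assuming (as the surrounding code suggests) that dynamic RoPE variants later overwrite inv_freq and restore it from the untouched copy:

    import torch
    from torch import nn

    class RotarySketch(nn.Module):
        def __init__(self, dim: int = 64, base: float = 10000.0):
            super().__init__()
            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
            # persistent=False keeps both buffers out of the state_dict, while
            # they still follow .to(device)/.to(dtype) like any registered buffer.
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # A cloned copy, so runtime rescaling of inv_freq cannot alias it
            # and the original frequencies can be restored exactly.
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    module = RotarySketch()
    module.inv_freq /= 2.0                               # e.g. a dynamic rescale
    module.inv_freq = module.original_inv_freq.clone()   # restore the original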
@@ -881,6 +881,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 def unfold_tensor(tensor, max_seq_len):
@@ -1459,7 +1462,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -1123,6 +1123,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
@@ -30,14 +30,19 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_phimoe import PhimoeConfig
 
@@ -59,7 +64,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -327,6 +332,7 @@ class PhimoeMultiplier(torch.autograd.Function):
         )
 
 
+@use_experts_implementation
 class PhimoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -617,7 +623,9 @@ class PhimoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(PhimoeTopKRouter, layer_name="mlp.router", index=0),
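The hunk above makes _can_compile_fullgraph depend on is_grouped_mm_available() instead of a fixed value, so fullgraph torch.compile support is only advertised when the grouped-matmul path used by the experts implementation exists. A hedged sketch of that general pattern; the capability probe below is purely illustrative and is not the actual implementation of is_grouped_mm_available():

    import torch

    def grouped_mm_supported() -> bool:
        # Illustrative probe only: check whether this torch build exposes a
        # grouped matmul op. The real helper may test more than this.
        return hasattr(torch, "_grouped_mm")

    class MoEModelSketch:
        # Evaluated once at class-definition time, so the flag reflects the
        # environment the library was imported into.
        _can_compile_fullgraph = grouped_mm_supported()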
@@ -52,7 +52,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     def forward(self, x, position_ids=None, layer_type=None):
         if layer_type is not None:
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
         An instance of ['T5Tokenizer`]. The tokenizer is a required input.
     """
 
-    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "Pix2StructImageProcessor"
-    tokenizer_class = ("T5Tokenizer",)
-
     def __init__(self, image_processor, tokenizer):
         tokenizer.return_token_type_ids = False
         super().__init__(image_processor, tokenizer)
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixio model configuration"""
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pixio import *
+    from .modeling_pixio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
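The new pixio __init__.py above follows the lazy-import layout used across the models package: type checkers see the real star imports, while at runtime the module is replaced by a _LazyModule that resolves attributes on first access. A rough generic stand-in for that behaviour using PEP 562 module-level __getattr__; the name-to-submodule mapping below is an assumption for illustration and this is not transformers' actual _LazyModule implementation:

    # lazy_pkg/__init__.py -- generic sketch of lazy attribute resolution
    import importlib
    from typing import TYPE_CHECKING

    # Assumed mapping from public names to the submodules that define them.
    _SUBMODULE_FOR_NAME = {"PixioConfig": ".configuration_pixio"}

    if TYPE_CHECKING:
        # Resolved statically only; never executed at runtime.
        from .configuration_pixio import PixioConfig

    def __getattr__(name):
        # Import the defining submodule the first time the name is touched,
        # keeping "import lazy_pkg" itself cheap.
        if name in _SUBMODULE_FOR_NAME:
            module = importlib.import_module(_SUBMODULE_FOR_NAME[name], __name__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")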