transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, logging
+from ...utils import TransformersKwargs, is_grouped_mm_available, logging
 from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding
 from ..llama.modeling_llama import (
     LlamaAttention,
@@ -177,7 +177,9 @@ class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):


 class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile

     @torch.no_grad()
     def _init_weights(self, module):
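The two HunYuan MoE hunks above gate full-graph compilation on the newly imported `is_grouped_mm_available()` helper instead of a hardcoded value, a pattern that, judging by the file list, recurs across the MoE models touched in this release. A minimal sketch of the same gating, using the helper added in this hunk but a hypothetical class name:

```python
# Hedged sketch: full-graph torch.compile support is advertised only when
# grouped-matmul kernels are available. ToyMoEPreTrainedModel is a placeholder,
# not a class from the diff; is_grouped_mm_available comes from the import above.
from transformers.utils import is_grouped_mm_available


class ToyMoEPreTrainedModel:
    # evaluated once at class-definition time, as in the hunk above
    _can_compile_fullgraph = is_grouped_mm_available()


print(ToyMoEPreTrainedModel._can_compile_fullgraph)  # True only if grouped MM kernels are usable
```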
@@ -593,16 +593,32 @@ class IBertPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "weight_integer", None) is not None:
+                init.zeros_(module.weight_integer)
+                init.zeros_(module.fc_scaling_factor)
+            if getattr(module, "bias_integer", None) is not None:
+                init.zeros_(module.bias_integer)
         elif isinstance(module, (QuantEmbedding, nn.Embedding)):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+            if getattr(module, "weight_scaling_factor", None) is not None:
+                init.zeros_(module.weight_scaling_factor)
+                init.zeros_(module.weight_integer)
         elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "shift", None) is not None:
+                init.zeros_(module.shift)
         elif isinstance(module, IBertLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, IBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, QuantAct):
+            init.constant_(module.x_min, -1e-5)
+            init.constant_(module.x_max, 1e-5)
+            init.zeros_(module.act_scaling_factor)

     def resize_token_embeddings(self, new_num_tokens=None):
         raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
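The I-BERT hunk above, like the Idefics, ImageGPT, and InstructBlip hunks that follow, extends `_init_weights` so that registered buffers (position ids, quantization scaling factors, cached masks) are filled through the `transformers.initialization` helpers (imported as `init` in these files) rather than only inside `__init__`. A minimal, self-contained sketch of that pattern in plain PyTorch, with placeholder names that are not part of the diff:

```python
# Hedged sketch of the buffer-initialization pattern, assuming nothing beyond torch.
import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings: int = 16, hidden_size: int = 8):
        super().__init__()
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        # registered empty in __init__; filled later by the init hook
        self.register_buffer(
            "position_ids", torch.zeros(1, max_position_embeddings, dtype=torch.long), persistent=False
        )


def init_toy_buffers(module: nn.Module) -> None:
    # mirrors init.copy_(module.position_ids, torch.arange(...).expand((1, -1))) from the hunk
    if isinstance(module, ToyEmbeddings):
        with torch.no_grad():
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


model = ToyEmbeddings()
model.apply(init_toy_buffers)
print(model.position_ids[0, :5].tolist())  # [0, 1, 2, 3, 4]
```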
@@ -840,6 +840,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, IdeficsVisionEmbeddings):
             init.normal_(module.class_embedding)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, IdeficsGatedCrossAttentionLayer):
             if self.config.alpha_initializer == "zeros":
                 init.zeros_(module.alpha_cross_attn)
@@ -852,6 +853,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
             init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
         elif isinstance(module, IdeficsPerceiverResampler):
             init.normal_(module.latents)
+        elif isinstance(module, IdeficsEmbedding):
+            inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+            t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1)
+            init.copy_(module.cos_cached, emb.cos())
+            init.copy_(module.sin_cached, emb.sin())


 @auto_docstring
@@ -452,6 +452,8 @@ class Idefics2VisionTransformer(Idefics2PreTrainedModel):
         self.encoder = Idefics2Encoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     def get_input_embeddings(self):
         return self.embeddings

@@ -711,6 +713,8 @@ class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
         self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
         self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)

+        self.post_init()
+
     @auto_docstring
     def forward(
         self,
@@ -1115,6 +1119,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -1130,10 +1135,11 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

@@ -458,6 +458,8 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
         self.patch_size = config.patch_size
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
@@ -887,6 +889,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -902,10 +905,11 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

@@ -164,12 +164,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):

             input_ids = reorder_images(input_ids_grouped, grouped_images_index)

-            return BatchFeature(
-                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
-                tensor_type=return_tensors,
-            )
+            return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)

-        pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

     def to_dict(self):
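In this hunk, and in the InstructBlipVideo video-processor hunk further down, the manual `torch.stack(...) if return_tensors else ...` step is dropped and the plain list is handed to `BatchFeature` together with `tensor_type=return_tensors`; presumably the conversion now happens inside `BatchFeature` itself (note `feature_extraction_utils.py +54 -22` in the file list). A hedged illustration of the call shape under that assumption, not a claim about the exact conversion internals:

```python
# Assumes the rc2 behavior implied by the diff: with tensor_type set, BatchFeature
# converts the list of per-image tensors itself, so callers no longer stack manually.
import torch
from transformers.feature_extraction_utils import BatchFeature

pixel_values = [torch.zeros(3, 4, 4), torch.ones(3, 4, 4)]  # one tensor per image
batch = BatchFeature(data={"pixel_values": pixel_values}, tensor_type="pt")
print(type(batch["pixel_values"]), getattr(batch["pixel_values"], "shape", None))
```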
@@ -61,7 +61,7 @@ class ImageGPTLayerNorm(nn.Module):
 class ImageGPTAttention(nn.Module):
     def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
         super().__init__()
-
+        self.config = config
         max_positions = config.max_position_embeddings
         self.register_buffer(
             "bias",
@@ -70,7 +70,6 @@ class ImageGPTAttention(nn.Module):
             ),
             persistent=False,
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -384,6 +383,14 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
             if "c_proj" in name and "weight" in name:
                 # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                 init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+        elif isinstance(module, ImageGPTAttention):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(
+                module.bias,
+                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                    1, 1, max_positions, max_positions
+                ),
+            )


 @auto_docstring
@@ -335,6 +335,8 @@ class InstructBlipPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip
@@ -128,6 +128,56 @@ class InstructBlipVideoVisionEmbeddings(nn.Module):
         return embeddings


+class InstructBlipVideoQFormerEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+
+            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
+            embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
 @auto_docstring
 class InstructBlipVideoPreTrainedModel(PreTrainedModel):
     config: InstructBlipVideoConfig
@@ -158,6 +208,8 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBlipVideo doesn't cast attn weights to fp32
@@ -677,56 +729,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module):
|
|
|
677
729
|
)
|
|
678
730
|
|
|
679
731
|
|
|
680
|
-
class InstructBlipVideoQFormerEmbeddings(nn.Module):
|
|
681
|
-
"""Construct the embeddings from word and position embeddings."""
|
|
682
|
-
|
|
683
|
-
def __init__(self, config):
|
|
684
|
-
super().__init__()
|
|
685
|
-
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
|
|
686
|
-
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
|
687
|
-
|
|
688
|
-
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
689
|
-
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
|
690
|
-
|
|
691
|
-
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
|
692
|
-
self.register_buffer(
|
|
693
|
-
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
|
694
|
-
)
|
|
695
|
-
|
|
696
|
-
self.config = config
|
|
697
|
-
|
|
698
|
-
def forward(
|
|
699
|
-
self,
|
|
700
|
-
input_ids=None,
|
|
701
|
-
position_ids=None,
|
|
702
|
-
query_embeds=None,
|
|
703
|
-
past_key_values_length=0,
|
|
704
|
-
):
|
|
705
|
-
if input_ids is not None:
|
|
706
|
-
seq_length = input_ids.size()[1]
|
|
707
|
-
else:
|
|
708
|
-
seq_length = 0
|
|
709
|
-
|
|
710
|
-
if position_ids is None:
|
|
711
|
-
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
|
|
712
|
-
|
|
713
|
-
if input_ids is not None:
|
|
714
|
-
embeddings = self.word_embeddings(input_ids)
|
|
715
|
-
|
|
716
|
-
position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
|
|
717
|
-
embeddings = embeddings + position_embeddings
|
|
718
|
-
|
|
719
|
-
if query_embeds is not None:
|
|
720
|
-
embeddings = torch.cat((query_embeds, embeddings), dim=1)
|
|
721
|
-
else:
|
|
722
|
-
embeddings = query_embeds
|
|
723
|
-
|
|
724
|
-
embeddings = embeddings.to(self.layernorm.weight.dtype)
|
|
725
|
-
embeddings = self.layernorm(embeddings)
|
|
726
|
-
embeddings = self.dropout(embeddings)
|
|
727
|
-
return embeddings
|
|
728
|
-
|
|
729
|
-
|
|
730
732
|
class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
|
|
731
733
|
"""
|
|
732
734
|
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
|
|
@@ -84,7 +84,6 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos
 
         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
 
         return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)
 
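The dropped line batched the per-video tensors explicitly before wrapping them in `BatchFeature`; tensor conversion is now left to the `tensor_type` handling. For reference, a standalone illustration (not the library code) of what that explicit `torch.stack` did, assuming all processed videos share one shape:

```python
import torch

# Hypothetical list of processed videos, all with identical shape
# (num_frames, channels, height, width).
processed_videos = [torch.randn(8, 3, 224, 224) for _ in range(4)]

# The removed line batched them along a new leading dimension.
batched = torch.stack(processed_videos, dim=0)
print(batched.shape)  # torch.Size([4, 8, 3, 224, 224])
```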
@@ -209,10 +209,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )
 
         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 # Based on timm implementation, which can be found here:
@@ -291,7 +290,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()
 
         if bool_masked_pos is not None:
@@ -308,7 +307,7 @@ class InternVLVisionEmbeddings(nn.Module):
 
         embeddings = self.dropout(embeddings)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 class InternVLVisionMLP(nn.Module):
@@ -455,7 +454,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
 
         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
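These hunks simplify `InternVLVisionPatchEmbeddings.forward` to return only the patch embeddings, dropping the `(patch_height, patch_width)` tuple callers previously had to unpack. A self-contained sketch of the conv-then-flatten pattern the layer uses (a toy module, not the library class):

```python
import torch
from torch import nn


class ToyPatchEmbeddings(nn.Module):
    def __init__(self, num_channels: int = 3, hidden_size: int = 32, patch_size: int = 16):
        super().__init__()
        # Each patch_size x patch_size patch is projected to a hidden_size vector.
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        embeddings = self.projection(pixel_values)           # (B, hidden, H/ps, W/ps)
        embeddings = embeddings.flatten(2).transpose(1, 2)   # (B, num_patches, hidden)
        return embeddings


x = torch.randn(2, 3, 224, 224)
print(ToyPatchEmbeddings()(x).shape)  # torch.Size([2, 196, 32]); 196 = (224 // 16) ** 2
```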
@@ -898,6 +897,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -909,12 +909,15 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
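The rewritten gate attaches `pixel_values` on the first generation iteration (or when caching is disabled) instead of keying off `cache_position`, so continuing generation from a cached system prompt still forwards the image. A minimal sketch of that gating logic with a hypothetical helper name and simplified inputs:

```python
from typing import Any


def attach_pixel_values(
    model_inputs: dict[str, Any],
    pixel_values: Any,
    is_first_iteration: bool,
    use_cache: bool = True,
) -> dict[str, Any]:
    # Forward image features only when they are not already merged into the
    # KV cache: on the first iteration, or whenever caching is disabled.
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs


# First call: the image goes in. Later cached calls: it does not.
print(attach_pixel_values({}, "image-tensor", is_first_iteration=True))   # {'pixel_values': 'image-tensor'}
print(attach_pixel_values({}, "image-tensor", is_first_iteration=False))  # {}
```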
@@ -29,7 +29,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_int
 from ...utils.generic import check_model_inputs
 from ..clip.modeling_clip import CLIPMLP
 from ..janus.modeling_janus import JanusVisionAttention
@@ -44,9 +44,6 @@ from ..llava.modeling_llava import (
 from .configuration_internvl import InternVLConfig, InternVLVisionConfig
 
 
-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -177,10 +174,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )
 
         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 # Based on timm implementation, which can be found here:
@@ -259,7 +255,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()
 
         if bool_masked_pos is not None:
@@ -276,7 +272,7 @@ class InternVLVisionEmbeddings(nn.Module):
 
         embeddings = self.dropout(embeddings)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 class InternVLVisionMLP(CLIPMLP):
@@ -412,7 +408,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
 
         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -140,7 +140,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos
 
         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
 
         return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
 
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jais2 import *
+    from .modeling_jais2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
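The new `__init__.py` registers the package as a `_LazyModule`, so submodules are imported only on first attribute access rather than at package import time. A hedged usage sketch, assuming the rc2 wheel ships this module as shown above:

```python
# Direct import from the subpackage; the _LazyModule machinery resolves
# configuration_jais2 lazily on first access.
from transformers.models.jais2 import Jais2Config

config = Jais2Config()
print(config.model_type)  # "jais2"
```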
@@ -0,0 +1,152 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/jais2/modular_jais2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_jais2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+
+
+class Jais2Config(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
+    model according to the specified arguments, defining the model architecture.
+    [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 150272):
+            Vocabulary size of the Jais2 model.
+        hidden_size (`int`, *optional*, defaults to 3328):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 26624):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 26):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key_value heads for Grouped Query Attention.
+        hidden_act (`str`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return last key/values attentions.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 150024):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension.
+        rope_parameters (`dict`, *optional*):
+            The RoPE parameters.
+    """
+
+    model_type = "jais2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: Optional[int] = 150272,
+        hidden_size: Optional[int] = 3328,
+        intermediate_size: Optional[int] = 26624,
+        num_hidden_layers: Optional[int] = 32,
+        num_attention_heads: Optional[int] = 26,
+        num_key_value_heads: Optional[int] = None,
+        hidden_act: Optional[str] = "relu2",
+        max_position_embeddings: Optional[int] = 8192,
+        initializer_range: Optional[float] = 0.02,
+        layer_norm_eps: Optional[float] = 1e-5,
+        use_cache: Optional[bool] = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = 0,
+        eos_token_id: Optional[int] = 150024,
+        tie_word_embeddings: Optional[bool] = False,
+        attention_bias: Optional[bool] = True,
+        attention_dropout: Optional[float] = 0.0,
+        mlp_bias: Optional[bool] = True,
+        head_dim: Optional[int] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.rope_parameters = rope_parameters
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["Jais2Config"]
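`Jais2Config` follows the usual Llama-style decoder config layout: `num_key_value_heads` falls back to `num_attention_heads`, and `head_dim` defaults to `hidden_size // num_attention_heads`. A hedged usage sketch (assuming the class is also re-exported at the top level of the rc2 wheel; the override values below are illustrative only):

```python
# Top-level import is an assumption; `from transformers.models.jais2 import Jais2Config`
# is the direct path shown in the new package above.
from transformers import Jais2Config

# Defaults as documented in the configuration docstring.
config = Jais2Config()
print(config.num_key_value_heads)  # 26  (falls back to num_attention_heads)
print(config.head_dim)             # 128 (3328 // 26)

# A smaller, purely illustrative variant for quick experiments.
tiny = Jais2Config(hidden_size=512, intermediate_size=2048, num_hidden_layers=4, num_attention_heads=8)
print(tiny.head_dim)  # 64
```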