transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/llama4/modeling_llama4.py

@@ -188,7 +188,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

@@ -1387,6 +1387,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -1398,12 +1399,15 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
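The rotary-embedding change above (repeated for LongcatFlash further down) turns `original_inv_freq` from a plain attribute into a non-persistent buffer. A minimal standalone sketch of why that matters, independent of the transformers classes: a registered buffer follows the module across `.to(device)` and dtype casts, while `persistent=False` keeps it out of the serialized state dict.

import torch
from torch import nn


class RotaryBufferDemo(nn.Module):
    # Illustrative module, not the transformers implementation.
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # New pattern from the diff: keep the pristine copy as a buffer too,
        # so it moves with the module instead of staying on the init device.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


demo = RotaryBufferDemo()
print("original_inv_freq" in demo.state_dict())  # False: persistent=False buffers are not saved
print(demo.original_inv_freq.shape)              # torch.Size([4])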
transformers/models/llava/image_processing_llava_fast.py

@@ -149,7 +149,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
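The removed `torch.stack` call above (the same line is dropped from the LLaVA-NeXT and LLaVA-OneVision fast processors below) leaves batching to `BatchFeature`'s own tensor conversion. A hedged sketch of the expected behaviour, assuming all processed images share one shape; the sizes are made-up example values:

import torch
from transformers import BatchFeature

processed_images = [torch.rand(3, 336, 336) for _ in range(2)]  # example shapes, not from the diff
features = BatchFeature(data={"pixel_values": processed_images}, tensor_type="pt")
print(features["pixel_values"].shape)  # expected torch.Size([2, 3, 336, 336]) once conversion stacks the list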
transformers/models/llava/modeling_llava.py

@@ -202,10 +202,11 @@ class LlavaModel(LlavaPreTrainedModel):
         image_features = self.multi_modal_projector(selected_image_feature)
 
         if "image_sizes" in kwargs:
-            split_sizes =
-            (
-
-
+            split_sizes = (
+                (torch.as_tensor(kwargs["image_sizes"], device=image_features.device) // self.vision_tower.patch_size)
+                .prod(dim=-1)
+                .tolist()
+            )
             image_features = torch.split(image_features.squeeze(0), split_sizes)
         else:
             image_features = list(image_features)
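The rewritten `split_sizes` expression above turns each `(height, width)` entry of `image_sizes` into a per-image patch count by integer-dividing by the vision tower's patch size and multiplying the two results. A small worked example with made-up sizes (patch size 14 is only an illustration, not taken from any checkpoint):

import torch

image_sizes = torch.tensor([[336, 336], [224, 448]])  # hypothetical (height, width) pairs
patch_size = 14                                        # hypothetical patch size
split_sizes = (image_sizes // patch_size).prod(dim=-1).tolist()
print(split_sizes)  # [576, 512] -> 24*24 and 16*32 patches respectively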
@@ -437,6 +438,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -448,12 +450,15 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
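The same two-part change recurs across the LLaVA-family models below: `prepare_inputs_for_generation` gains an `is_first_iteration` flag, and vision inputs are forwarded only on that first iteration or when caching is off, rather than keying on the cache position. A minimal standalone sketch of the gating logic (not the actual transformers method):

def select_vision_inputs(model_inputs, pixel_values, is_first_iteration=False, **kwargs):
    # Forward pixel values on the first generation iteration, or always when use_cache is False;
    # on later cached iterations the image features are already merged into the cache.
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs


print(select_vision_inputs({}, "px", is_first_iteration=True))  # {'pixel_values': 'px'}
print(select_vision_inputs({}, "px"))                           # {}
print(select_vision_inputs({}, "px", use_cache=False))          # {'pixel_values': 'px'}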
transformers/models/llava_next/image_processing_llava_next_fast.py

@@ -260,7 +260,6 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
         )
transformers/models/llava_next/modeling_llava_next.py

@@ -692,6 +692,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -703,12 +704,15 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
 
transformers/models/llava_next_video/modeling_llava_next_video.py

@@ -868,6 +868,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing

@@ -879,12 +880,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
transformers/models/llava_next_video/modular_llava_next_video.py

@@ -693,6 +693,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing

@@ -704,12 +705,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
transformers/models/llava_onevision/image_processing_llava_onevision_fast.py

@@ -279,7 +279,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,
transformers/models/llava_onevision/modeling_llava_onevision.py

@@ -846,6 +846,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -857,12 +858,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos
transformers/models/llava_onevision/modular_llava_onevision.py

@@ -211,7 +211,6 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,

@@ -698,6 +697,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -709,12 +709,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos
transformers/models/longcat_flash/modeling_longcat_flash.py

@@ -82,7 +82,7 @@ class LongcatFlashRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

@@ -563,6 +563,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, LongcatFlashTopkRouter):
             init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         if isinstance(module, LongcatFlashExperts):
             if module.gate_up_proj is not None:
                 init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
transformers/models/longcat_flash/modular_longcat_flash.py

@@ -347,6 +347,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, LongcatFlashTopkRouter):
             init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         if isinstance(module, LongcatFlashExperts):
             if module.gate_up_proj is not None:
                 init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
transformers/models/longt5/modeling_longt5.py

@@ -1583,12 +1583,10 @@ class LongT5Model(LongT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = LongT5Stack(encoder_config)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = LongT5Stack(decoder_config)
 

@@ -1746,12 +1744,10 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = LongT5Stack(encoder_config)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = LongT5Stack(decoder_config)
 
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin

@@ -84,6 +85,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

@@ -515,6 +517,14 @@ class M2M100PreTrainedModel(PreTrainedModel):
     # Doesn't support `compile` (dynamic control flow). Can be fixed but low usage model
     _can_compile_fullgraph = False

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, M2M100SinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+

 class M2M100Encoder(M2M100PreTrainedModel):
     """
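Note: the M2M100 hunks above recreate the sinusoidal position table inside `_init_weights` via `module.get_embedding(...)` instead of relying on construction-time state. As a rough illustration only, a minimal sketch of the fairseq-style sinusoidal table such a helper typically produces (the function below is hypothetical, not the library's verified implementation):

    import math
    import torch

    def sinusoidal_table(num_positions, embedding_dim, padding_idx=None):
        # Log-spaced frequencies; first half of the channels take sin, second half cos.
        half_dim = embedding_dim // 2
        freqs = torch.exp(torch.arange(half_dim, dtype=torch.float) * -(math.log(10000.0) / (half_dim - 1)))
        angles = torch.arange(num_positions, dtype=torch.float).unsqueeze(1) * freqs.unsqueeze(0)
        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        if embedding_dim % 2 == 1:
            table = torch.cat([table, torch.zeros(num_positions, 1)], dim=1)  # pad odd widths
        if padding_idx is not None:
            table[padding_idx] = 0.0  # the padding position carries no signal
        return table

    # e.g. a (num_positions + offset) x embedding_dim table that init.copy_ would write into the buffer
    weights = sinusoidal_table(1024 + 2, 1024, padding_idx=1)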
@@ -26,7 +26,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...configuration_utils import PreTrainedConfig
 from ...generation import GenerationMixin
-from ...integrations
+from ...integrations import lazy_load_kernel
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import PreTrainedModel
 from ...utils import (

@@ -750,6 +750,7 @@ class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin):
         cache_params: Optional[MambaCache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ):
         # Overwritten -- uses `cache_params` as opposed to `past_key_values`
@@ -24,6 +24,7 @@ from torch import nn
 from ... import initialization as init
 from ...activations import ACT2FN
 from ...generation import GenerationMixin
+from ...integrations import lazy_load_kernel
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import PreTrainedModel
 from ...utils import (

@@ -31,35 +32,12 @@ from ...utils import (
     auto_docstring,
     logging,
 )
-from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
 from .configuration_mamba2 import Mamba2Config


 logger = logging.get_logger(__name__)


-if is_mamba_2_ssm_available():
-    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
-else:
-    mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None
-
-if is_causal_conv1d_available():
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
-else:
-    causal_conv1d_update, causal_conv1d_fn = None, None
-
-is_fast_path_available = all(
-    (
-        selective_state_update,
-        mamba_chunk_scan_combined,
-        mamba_split_conv1d_scan_combined,
-        causal_conv1d_fn,
-        causal_conv1d_update,
-    )
-)
-
-
 # Helper methods for segment sum computation


@@ -286,6 +264,28 @@ class Mamba2Mixer(nn.Module):
         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
         self.use_bias = config.use_bias

+        global causal_conv1d_update, causal_conv1d_fn
+        causal_conv1d = lazy_load_kernel("causal-conv1d")
+        causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
+        causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)
+
+        global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+        mamba_ssm = lazy_load_kernel("mamba-ssm")
+        selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
+        mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
+        mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)
+
+        global is_fast_path_available
+        is_fast_path_available = all(
+            (
+                selective_state_update,
+                mamba_chunk_scan_combined,
+                mamba_split_conv1d_scan_combined,
+                causal_conv1d_fn,
+                causal_conv1d_update,
+            )
+        )
+
         if not is_fast_path_available:
             logger.warning_once(
                 "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"

@@ -955,6 +955,7 @@ class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
         cache_params: Optional[Mamba2Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ):
         # Overwritten -- uses `cache_params` as opposed to `past_key_values`
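Note: the Mamba and Mamba2 hunks above replace import-time `is_*_available()` guards with `lazy_load_kernel` lookups performed when the mixer is built, with each optional symbol resolving to None when its kernel is absent. A rough, hypothetical sketch of that fallback pattern under an assumed loader (not the library's verified helper):

    from typing import Any, Optional

    def load_optional_kernel(loader, name: str) -> Optional[Any]:
        # Return a kernel namespace, or None when it cannot be resolved.
        try:
            return loader(name)
        except Exception:
            return None

    def fast_path_available(loader) -> bool:
        causal_conv1d = load_optional_kernel(loader, "causal-conv1d")
        mamba_ssm = load_optional_kernel(loader, "mamba-ssm")
        # Missing symbols become None, so all(...) only passes when every kernel is present.
        return all(
            (
                getattr(mamba_ssm, "selective_state_update", None),
                getattr(mamba_ssm, "mamba_chunk_scan_combined", None),
                getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None),
                getattr(causal_conv1d, "causal_conv1d_fn", None),
                getattr(causal_conv1d, "causal_conv1d_update", None),
            )
        )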
@@ -147,7 +147,7 @@ class MarianConfig(PreTrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
         self.share_encoder_decoder_embeddings = share_encoder_decoder_embeddings
-        kwargs["
+        kwargs["tie_word_embeddings"] = share_encoder_decoder_embeddings
         super().__init__(
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,

@@ -451,6 +451,8 @@ class MarianPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MarianSinusoidalPositionalEmbedding):
             init.copy_(module.weight, module.create_weight())
+        elif isinstance(module, MarianMTModel):
+            init.zeros_(module.final_logits_bias)

     @property
     def dummy_inputs(self):

@@ -1248,6 +1250,7 @@ class MarianDecoderWrapper(MarianPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = MarianDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -14,7 +14,6 @@
 # limitations under the License.
 """PyTorch MarkupLM model."""

-import os
 from collections.abc import Callable
 from typing import Optional, Union

@@ -486,9 +485,9 @@ class MarkupLMEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)

             layer_outputs = layer_module(
-                hidden_states
-                attention_mask
-                output_attentions
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )

@@ -517,10 +516,8 @@ class MarkupLMPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MarkupLMLMPredictionHead):
             init.zeros_(module.bias)
-
-
-    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif isinstance(module, MarkupLMEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 @auto_docstring
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Mask2Former model configuration"""

-from typing import Optional
+from typing import Optional, Union

 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging

@@ -39,7 +39,7 @@ class Mask2FormerConfig(PreTrainedConfig):
     Currently, Mask2Former only supports the [Swin Transformer](swin) as backbone.

     Args:
-        backbone_config (`PreTrainedConfig
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model. If unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
         backbone (`str`, *optional*):

@@ -134,7 +134,7 @@ class Mask2FormerConfig(PreTrainedConfig):

     def __init__(
         self,
-        backbone_config: Optional[dict] = None,
+        backbone_config: Optional[Union[dict, PreTrainedConfig]] = None,
         feature_size: int = 256,
         mask_feature_size: int = 256,
         hidden_dim: int = 256,

@@ -387,10 +387,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
         encoded_inputs = BatchFeature(
-            data={
-                "pixel_values": torch.stack(processed_images, dim=0) if return_tensors else processed_images,
-                "pixel_mask": torch.stack(processed_pixel_masks, dim=0) if return_tensors else processed_pixel_masks,
-            },
+            data={"pixel_values": processed_images, "pixel_mask": processed_pixel_masks},
             tensor_type=return_tensors,
         )
         if segmentation_maps is not None:
@@ -2149,6 +2149,10 @@ class Mask2FormerPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)

         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
             init.ones_(module.weight)

@@ -2160,6 +2164,11 @@ class Mask2FormerPreTrainedModel(PreTrainedModel):
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])

+        elif isinstance(module, Mask2FormerLoss):
+            empty_weight = torch.ones(module.num_labels + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)
+
         if hasattr(module, "reference_points"):
             init.xavier_uniform_(module.reference_points.weight, gain=1.0)
             init.constant_(module.reference_points.bias, 0.0)
@@ -14,7 +14,7 @@
 # limitations under the License.
 """MaskFormer model configuration"""

-from typing import Optional
+from typing import Optional, Union

 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging

@@ -49,7 +49,7 @@ class MaskFormerConfig(PreTrainedConfig):
         use_auxiliary_loss(`bool`, *optional*, defaults to `False`):
             If `True` [`MaskFormerForInstanceSegmentationOutput`] will contain the auxiliary losses computed using the
             logits from each decoder's stage.
-        backbone_config (`
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration passed to the backbone, if unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
         backbone (`str`, *optional*):

@@ -114,7 +114,7 @@ class MaskFormerConfig(PreTrainedConfig):
         mask_feature_size: int = 256,
         no_object_weight: float = 0.1,
         use_auxiliary_loss: bool = False,
-        backbone_config: Optional[dict] = None,
+        backbone_config: Optional[Union[dict, PreTrainedConfig]] = None,
         decoder_config: Optional[dict] = None,
         init_std: float = 0.02,
         init_xavier_std: float = 1.0,

@@ -391,10 +391,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
         encoded_inputs = BatchFeature(
-            data={
-                "pixel_values": torch.stack(processed_images, dim=0) if return_tensors else processed_images,
-                "pixel_mask": torch.stack(processed_pixel_masks, dim=0) if return_tensors else processed_pixel_masks,
-            },
+            data={"pixel_values": processed_images, "pixel_mask": processed_pixel_masks},
             tensor_type=return_tensors,
         )
         if segmentation_maps is not None:
@@ -174,7 +174,7 @@ class MaskFormerModelOutput(ModelOutput):
     custom_intro="""
     Class for outputs of [`MaskFormerForInstanceSegmentation`].

-    This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or
+    This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or
     [`~MaskFormerImageProcessor.post_process_instance_segmentation`] or
     [`~MaskFormerImageProcessor.post_process_panoptic_segmentation`] depending on the task. Please, see
     [`~MaskFormerImageProcessor] for details regarding usage.

@@ -1470,11 +1470,19 @@ class MaskFormerPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight, mean=0.0, std=std)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MaskFormerLoss):
+            empty_weight = torch.ones(module.num_labels + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)


 @auto_docstring
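Note: the MaskFormer and Mask2Former hunks above move the loss's `empty_weight` buffer into `_init_weights`: a cross-entropy class-weight vector whose extra "no object" slot is down-weighted by `eos_coef`. A small illustration with assumed example values (`num_labels=3`, `eos_coef=0.1`):

    import torch

    num_labels, eos_coef = 3, 0.1  # assumed example values
    empty_weight = torch.ones(num_labels + 1)
    empty_weight[-1] = eos_coef  # last slot is the "no object" class
    # empty_weight -> tensor([1.0, 1.0, 1.0, 0.1])
    # typically passed as the `weight` argument of a cross-entropy classification loss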