transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff shows the changes between two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -196,7 +196,7 @@ class DeepseekVLModel(DeepseekVLPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -268,7 +268,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -315,6 +315,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
         inputs_embeds=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing
@@ -326,12 +327,15 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMi
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
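
The `is_first_iteration` hunks above gate multimodal inputs during generation: pixel values are forwarded only on the first step, or whenever the KV cache is disabled, since later steps reuse image features already merged into the cache. A minimal standalone sketch of that pattern (the helper name is made up for illustration; this is not the transformers implementation itself):

# Sketch of the gating introduced by the hunks above.
def gate_pixel_values(model_inputs, pixel_values, is_first_iteration=False, **kwargs):
    # On later iterations the image features are already in the cache,
    # so re-sending pixel_values would be redundant.
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

first = gate_pixel_values({}, pixel_values="img", is_first_iteration=True)
later = gate_pixel_values({}, pixel_values="img", is_first_iteration=False)
assert "pixel_values" in first and "pixel_values" not in later
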
@@ -134,6 +134,9 @@ class DeepseekVLAligner(nn.Module):
 class DeepseekVLPreTrainedModel(JanusPreTrainedModel):
     _no_split_modules = ["LlamaDecoderLayer"]

+    def _init_weights(self, module):
+        raise AttributeError("No need to inherit!")
+

 @auto_docstring
 class DeepseekVLModel(JanusModel):
@@ -207,9 +207,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -233,7 +230,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -314,7 +314,7 @@ class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -424,7 +424,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -473,6 +473,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -482,12 +483,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel,
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -297,7 +297,7 @@ class DeepseekVLHybridModel(DeepseekVLModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -361,7 +361,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -410,6 +410,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -419,12 +420,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneratio
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -888,9 +892,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -914,7 +915,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -37,7 +37,7 @@ class DeformableDetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
@@ -269,8 +269,8 @@ class DeformableDetrConfig(PreTrainedConfig):
         self.eos_coefficient = eos_coefficient
         self.focal_alpha = focal_alpha
         self.disable_custom_kernels = disable_custom_kernels
+
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-        self.tie_encoder_decoder = True


 __all__ = ["DeformableDetrConfig"]
@@ -956,7 +956,7 @@ class DeformableDetrPreTrainedModel(PreTrainedModel):
             init.constant_(module.value_proj.bias, 0.0)
             init.xavier_uniform_(module.output_proj.weight)
             init.constant_(module.output_proj.bias, 0.0)
-        elif isinstance(module, (nn.Linear, nn.Conv2d
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -34,9 +34,8 @@ class DepthAnythingConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.

     Args:
-        backbone_config (`Union[dict
-            The configuration of the backbone model.
-            leverage the [`AutoBackbone`] API.
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
+            The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
             will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
@@ -94,7 +94,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -37,7 +37,7 @@ class DetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
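
The corrected docstring says `backbone_config` accepts either a dict or a `PreTrainedConfig` instance. A hedged usage sketch of what that implies (assuming the standard DETR config API; not taken from the diff itself):

# Both forms are expected to be equivalent when use_timm_backbone=False;
# a dict is resolved to its config class via the "model_type" key.
from transformers import DetrConfig, ResNetConfig

cfg_from_object = DetrConfig(use_timm_backbone=False, backbone_config=ResNetConfig())
cfg_from_dict = DetrConfig(use_timm_backbone=False, backbone_config={"model_type": "resnet"})
print(type(cfg_from_object.backbone_config), type(cfg_from_dict.backbone_config))
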
@@ -741,7 +741,7 @@ class DetrPreTrainedModel(PreTrainedModel):
         elif isinstance(module, DetrLearnedPositionEmbedding):
             init.uniform_(module.row_embeddings.weight)
             init.uniform_(module.column_embeddings.weight)
-        if isinstance(module, (nn.Linear, nn.Conv2d
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -750,6 +750,9 @@ class DetrPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            init.ones_(module.weight)
+            init.zeros_(module.bias)


 class DetrEncoder(DetrPreTrainedModel):
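
The new branch resets normalization layers to the conventional identity initialization. A standalone illustration using plain `torch.nn.init` (the diff itself goes through the library's `initialization` module, imported as `init`):

import torch
from torch import nn

def init_norms_(module: nn.Module) -> None:
    # Same effect as the added branch: weight -> 1, bias -> 0 for norm layers.
    if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)

layer = nn.LayerNorm(8)
with torch.no_grad():
    layer.weight.fill_(3.0)  # pretend the weight was perturbed
init_norms_(layer)
assert torch.all(layer.weight == 1) and torch.all(layer.bias == 0)
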
@@ -1457,8 +1460,12 @@ class DetrForSegmentation(DetrPreTrainedModel):

         >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
         >>> panoptic_seg = result[0]["segmentation"]
+        >>> panoptic_seg.shape
+        torch.Size([300, 500])
         >>> # Get prediction score and segment_id to class_id mapping of each segment
         >>> panoptic_segments_info = result[0]["segments_info"]
+        >>> len(panoptic_segments_info)
+        5
         ```"""

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -110,11 +110,9 @@ class DiaGenerationMixin(GenerationMixin):
         return merged_processors

     def _prepare_generation_config(
-        self, generation_config: Optional[GenerationConfig],
+        self, generation_config: Optional[GenerationConfig], **kwargs: Any
     ) -> tuple[GenerationConfig, dict]:
-        generation_config, model_kwargs = super()._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)

         # We allow generation up to max length + max delay pattern
         # (will revert back to max length after generation)
@@ -260,7 +258,6 @@ class DiaGenerationMixin(GenerationMixin):
         streamer: Optional["BaseStreamer"] = None,
         negative_prompt_ids: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        use_model_defaults: Optional[bool] = None,
         custom_generate: Optional[str] = None,
         **kwargs,
     ):
@@ -273,9 +270,7 @@ class DiaGenerationMixin(GenerationMixin):
             assistant_model,
             streamer,
         )
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode(assistant_model)

         if generation_mode not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
@@ -425,7 +420,6 @@ class DiaGenerationMixin(GenerationMixin):
|
|
|
425
420
|
streamer: Optional["BaseStreamer"] = None,
|
|
426
421
|
negative_prompt_ids: Optional[torch.Tensor] = None,
|
|
427
422
|
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
|
|
428
|
-
use_model_defaults: Optional[bool] = None,
|
|
429
423
|
custom_generate: Optional[str] = None,
|
|
430
424
|
**kwargs,
|
|
431
425
|
) -> Union[GenerateOutput, torch.LongTensor]:
|
|
@@ -445,7 +439,6 @@ class DiaGenerationMixin(GenerationMixin):
|
|
|
445
439
|
streamer=streamer,
|
|
446
440
|
negative_prompt_ids=negative_prompt_ids,
|
|
447
441
|
negative_prompt_attention_mask=negative_prompt_attention_mask,
|
|
448
|
-
use_model_defaults=use_model_defaults,
|
|
449
442
|
custom_generate=custom_generate,
|
|
450
443
|
**kwargs,
|
|
451
444
|
)
|
|
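The hunks above drop the dedicated `use_model_defaults` argument and route everything through `**kwargs`. A minimal sketch of the resulting override pattern, using a hypothetical subclass rather than the Dia code and assuming the rc2 base signature shown in the diff:

from typing import Any, Optional

from transformers import GenerationConfig, GenerationMixin


class MyGenerationMixin(GenerationMixin):  # hypothetical subclass for illustration
    def _prepare_generation_config(
        self, generation_config: Optional[GenerationConfig], **kwargs: Any
    ) -> tuple[GenerationConfig, dict]:
        # `use_model_defaults` is no longer threaded through explicitly; it travels in **kwargs if given.
        generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)
        return generation_config, model_kwargs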
@@ -25,6 +25,7 @@ from typing import Optional, Union
 import torch
 from torch import nn

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
@@ -61,6 +62,12 @@ class DiaPreTrainedModel(PreTrainedModel):
     main_input_name = "input_ids"
     _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiaMultiChannelEmbedding):
+            offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
+            init.copy_(module.offsets, offsets)
+

 class DiaMultiChannelEmbedding(nn.Module):
     """In order to efficiently compute the audio embedding from the 9 different channels,
@@ -146,7 +153,7 @@ class DiaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -452,6 +459,8 @@ class DiaEncoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -578,6 +587,8 @@ class DiaDecoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
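The new `_init_weights` branch computes one vocabulary offset per audio channel so the multi-channel embedding can index a single flat table. A minimal numeric sketch (9 channels matches the docstring quoted above; `vocab_size=1028` is an illustrative value, not necessarily the checkpoint default):

import torch

num_channels, vocab_size = 9, 1028
offsets = torch.arange(num_channels, dtype=torch.long) * vocab_size
print(offsets)  # tensor([   0, 1028, 2056, 3084, 4112, 5140, 6168, 7196, 8224])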
@@ -20,6 +20,7 @@ from typing import Optional, Union
 import torch
 from torch import nn

+from ... import initialization as init
 from ...cache_utils import DynamicCache, EncoderDecoderCache
 from ...masking_utils import create_bidirectional_mask, create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
@@ -59,6 +60,12 @@ class DiaPreTrainedModel(PreTrainedModel):
     main_input_name = "input_ids"
     _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiaMultiChannelEmbedding):
+            offsets = torch.arange(self.config.num_channels, dtype=torch.long) * self.config.vocab_size
+            init.copy_(module.offsets, offsets)
+

 class DiaMultiChannelEmbedding(nn.Module):
     """In order to efficiently compute the audio embedding from the 9 different channels,
@@ -241,6 +248,8 @@ class DiaEncoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -367,6 +376,8 @@ class DiaDecoder(DiaPreTrainedModel):
         self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
         self.rotary_emb = DiaRotaryEmbedding(config=config)

+        self.post_init()
+
     @auto_docstring
     @can_return_tuple
     def forward(
@@ -74,7 +74,7 @@ class DiaProcessor(ProcessorMixin):
         tokenizer (`DiaTokenizer`):
             An instance of [`DiaTokenizer`]. The tokenizer is a required input.
         audio_tokenizer (`DacModel`):
-            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is
+            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is a required input.
     """

     audio_tokenizer_class = "DacModel"
@@ -86,7 +86,7 @@ class DiffLlamaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
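Here, as in the Dia and Doge rotary embeddings, the truncated `self.original_inv_freq = ...` assignment is replaced by a registered, non-persistent buffer, so the saved copy of the RoPE frequencies follows `.to()`/`.cuda()` moves without entering the state dict. A minimal sketch of the pattern with a toy module (not the transformers class):

import torch
from torch import nn


class TinyRotary(nn.Module):  # hypothetical stand-in
    def __init__(self, dim: int = 8, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


module = TinyRotary()
print(module.original_inv_freq.shape)     # torch.Size([4])
print(list(module.state_dict().keys()))   # [] -- non-persistent buffers are excluded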
@@ -361,8 +361,8 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(self.config, "
-            target_dtype = self.config.
+        elif hasattr(self.config, "quantization_config"):
+            target_dtype = self.config.dtype
         else:
             target_dtype = self.q_proj.weight.dtype

@@ -236,8 +236,8 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(self.config, "
-            target_dtype = self.config.
+        elif hasattr(self.config, "quantization_config"):
+            target_dtype = self.config.dtype
         else:
             target_dtype = self.q_proj.weight.dtype

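Both copies of the flash-attention hunk restore the quantized-model branch: when the config carries a `quantization_config`, the upcast target falls back to the configured compute dtype. A minimal sketch of that control flow with a stand-in config object (not the DiffLlama code):

import torch


class _Config:  # hypothetical stand-in
    quantization_config = {"quant_method": "bnb"}
    dtype = torch.float16


config = _Config()
if hasattr(config, "quantization_config"):  # quantized model: use the configured compute dtype
    target_dtype = config.dtype
else:
    target_dtype = torch.float32
print(target_dtype)  # torch.float16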
@@ -88,7 +88,6 @@ class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -466,6 +466,9 @@ class DINOv3ViTPreTrainedModel(PreTrainedModel):
             init.zeros_(module.mask_token)
         elif isinstance(module, DINOv3ViTLayerScale):
             init.constant_(module.lambda1, self.config.layerscale_value)
+        elif isinstance(module, DINOv3ViTRopePositionEmbedding):
+            inv_freq = 1 / module.base ** torch.arange(0, 1, 4 / module.head_dim, dtype=torch.float32)
+            init.copy_(module.inv_freq, inv_freq)


 @auto_docstring
@@ -361,6 +361,9 @@ class DINOv3ViTPreTrainedModel(Dinov2PreTrainedModel):
             init.zeros_(module.mask_token)
         elif isinstance(module, DINOv3ViTLayerScale):
             init.constant_(module.lambda1, self.config.layerscale_value)
+        elif isinstance(module, DINOv3ViTRopePositionEmbedding):
+            inv_freq = 1 / module.base ** torch.arange(0, 1, 4 / module.head_dim, dtype=torch.float32)
+            init.copy_(module.inv_freq, inv_freq)


 @auto_docstring
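The added branch seeds the DINOv3 RoPE inverse frequencies as `1 / base ** t` for `head_dim // 4` evenly spaced exponents `t` in `[0, 1)`. A minimal numeric sketch (`base=100.0` and `head_dim=64` are illustrative values, not necessarily the model defaults):

import torch

base, head_dim = 100.0, 64
exponents = torch.arange(0, 1, 4 / head_dim, dtype=torch.float32)  # 16 exponents: 0.0, 0.0625, ..., 0.9375
inv_freq = 1 / base**exponents
print(exponents.shape, inv_freq[:3])  # torch.Size([16]) tensor([1.0000, 0.7499, 0.5623])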
@@ -305,15 +305,17 @@ class DistilBertPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module: nn.Module):
         """Initialize the weights."""
         super()._init_weights(module)
-        if isinstance(module, Embeddings)
-
-
-
-
-
-
-
-
+        if isinstance(module, Embeddings):
+            if self.config.sinusoidal_pos_embds:
+                init.copy_(
+                    module.position_embeddings.weight,
+                    create_sinusoidal_embeddings(
+                        self.config.max_position_embeddings,
+                        self.config.dim,
+                        torch.empty_like(module.position_embeddings.weight),
+                    ),
+                )
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 @auto_docstring
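The rewritten branch copies a freshly built sinusoidal table into `position_embeddings.weight` when `sinusoidal_pos_embds` is set, and re-fills the `position_ids` buffer. A minimal sketch of the standard sinusoidal scheme, written from scratch here rather than calling the library's `create_sinusoidal_embeddings` helper:

import math

import torch


def sinusoidal_table(n_pos: int, dim: int) -> torch.Tensor:
    position = torch.arange(n_pos, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(10000.0) / dim))
    table = torch.zeros(n_pos, dim)
    table[:, 0::2] = torch.sin(position * div_term)  # even columns: sine
    table[:, 1::2] = torch.cos(position * div_term)  # odd columns: cosine
    return table


print(sinusoidal_table(512, 768).shape)  # torch.Size([512, 768])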
@@ -88,7 +88,7 @@ class DogeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -231,7 +231,6 @@ class DonutImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -381,18 +381,7 @@ class DonutSwinSelfAttention(nn.Module):
             torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
         )

-
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
-        coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += self.window_size[0] - 1
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)
-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", self.create_relative_position_index())

         self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
         self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -451,6 +440,20 @@ class DonutSwinSelfAttention(nn.Module):

         return outputs

+    def create_relative_position_index(self):
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+

 # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
 class DonutSwinSelfOutput(nn.Module):
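The relative-position-index computation is now shared between `__init__` and `_init_weights` via `create_relative_position_index`. A minimal worked example of the same arithmetic for a 2x2 window (real checkpoints typically use larger windows):

import torch

window_size = (2, 2)
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))
coords_flatten = torch.flatten(coords, 1)
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += window_size[0] - 1   # shift row deltas to start at 0
relative_coords[:, :, 1] += window_size[1] - 1   # shift column deltas to start at 0
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = relative_coords.sum(-1)
print(relative_position_index)
# tensor([[4, 3, 1, 0],
#         [5, 4, 2, 1],
#         [7, 6, 4, 3],
#         [8, 7, 5, 4]])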
@@ -801,6 +804,7 @@ class DonutSwinPreTrainedModel(PreTrainedModel):
             init.zeros_(module.position_embeddings)
         elif isinstance(module, DonutSwinSelfAttention):
             init.zeros_(module.relative_position_bias_table)
+            init.copy_(module.relative_position_index, module.create_relative_position_index())


 @auto_docstring