transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -116,18 +116,17 @@ class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
         super().__init__()
         self.max_len = config.max_source_positions
         self.d_model = config.hidden_size
-        self.pe = None
-        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+        self.register_buffer("pe", self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)), persistent=False)

-    def extend_pe(self, x):
+    def extend_pe(self, x, pe=None):
         # Reset the positional encodings
-        if self.pe is not None:
+        if pe is not None:
             # self.pe contains both positive and negative parts
             # the length of self.pe is 2 * input_len - 1
-            if self.pe.size(1) >= x.size(1) * 2 - 1:
-                if self.pe.dtype != x.dtype or self.pe.device != x.device:
-                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
-                return
+            if pe.size(1) >= x.size(1) * 2 - 1:
+                if pe.dtype != x.dtype or pe.device != x.device:
+                    pe = pe.to(dtype=x.dtype, device=x.device)
+                return pe
         # Suppose `i` is the position of query vector and `j` is the
         # position of key vector. We use positive relative positions when keys
         # are to the left (i>j) and negative relative positions otherwise (i<j).
@@ -148,10 +147,10 @@ class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
         pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
         pe_negative = pe_negative[1:].unsqueeze(0)
         pe = torch.cat([pe_positive, pe_negative], dim=1)
-        self.pe = pe.to(device=x.device, dtype=x.dtype)
+        return pe.to(device=x.device, dtype=x.dtype)

     def forward(self, hidden_states: torch.Tensor):
-        self.extend_pe(hidden_states)
+        self.pe = self.extend_pe(hidden_states, self.pe)
         start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
         end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
         relative_position_embeddings = self.pe[:, start_idx:end_idx]
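The Wav2Vec2Conformer change above replaces the plain `self.pe` attribute with a non-persistent buffer, and `extend_pe` now returns the table instead of mutating module state. A minimal sketch with a toy module (not the transformers class) of the PyTorch behavior this relies on: a buffer registered with `persistent=False` follows the module across `.to()`/`.cuda()` but is excluded from `state_dict()`.

import torch
from torch import nn


class ToyRelPos(nn.Module):
    def __init__(self, max_len: int = 8, d_model: int = 4):
        super().__init__()
        # non-persistent buffer: tracked for device/dtype moves, excluded from state_dict()
        self.register_buffer("pe", torch.zeros(1, 2 * max_len - 1, d_model), persistent=False)


m = ToyRelPos()
print("pe" in m.state_dict())  # False -> checkpoints stay free of the cached table
print(m.pe.shape)              # torch.Size([1, 15, 4])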
@@ -584,15 +583,26 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):

             if module.bias is not None:
                 init.zeros_(module.bias)
-        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Conv1d):
             init.kaiming_normal_(module.weight)

             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, Wav2Vec2ConformerRotaryPositionalEmbedding):
+            dim = self.config.hidden_size // self.config.num_attention_heads
+            base = self.config.rotary_embedding_base
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+            init.copy_(module.inv_freq, inv_freq)
+        elif isinstance(module, Wav2Vec2ConformerRelPositionalEmbedding):
+            init.copy_(module.pe, module.extend_pe(torch.tensor(0.0).expand(1, module.max_len)))

     def _get_feat_extract_output_lengths(
         self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
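The new `_init_weights` branches above recompute buffers that are not stored in checkpoints, including the rotary inverse-frequency table. A standalone sketch of that inverse-frequency computation with toy values (hidden_size=64, num_attention_heads=4, base=10000 are assumptions for illustration):

import torch

hidden_size, num_attention_heads, base = 64, 4, 10000
dim = hidden_size // num_attention_heads  # per-head dimension, 16 here
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq.shape)  # torch.Size([8]) -> one frequency per pair of channels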
@@ -658,6 +658,7 @@ class WhisperGenerationMixin(GenerationMixin):
         )

         # 1. prepare generation config
+        generation_config = self.generation_config if generation_config is None else generation_config
         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)

         # 2. set global generate variables
@@ -670,7 +670,7 @@ class WhisperEncoder(WhisperPreTrainedModel):
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
-                   None,
+                   attention_mask=None,
                    output_attentions=output_attentions,
                )

@@ -866,8 +866,9 @@ class WhisperDecoder(WhisperPreTrainedModel):

            layer_outputs = decoder_layer(
                hidden_states,
-               attention_mask=causal_mask,
-               encoder_hidden_states=encoder_hidden_states,
+               causal_mask,
+               encoder_hidden_states,
+               encoder_attention_mask=None,
                past_key_values=past_key_values if use_cache else None,
                output_attentions=output_attentions,
                use_cache=use_cache,
@@ -1247,6 +1248,7 @@ class WhisperDecoderWrapper(WhisperPreTrainedModel):
        super().__init__(config)
        config.is_encoder_decoder = False
        self.decoder = WhisperDecoder(config)
+       self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens
@@ -512,11 +512,13 @@ class XCLIPPreTrainedModel(PreTrainedModel):
        if isinstance(module, XCLIPTextEmbeddings):
            init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
            init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+           init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, XCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+           init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, XCLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
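The added `init.copy_` lines above refill the `position_ids` buffers with sequential indices. What that expression produces, shown standalone with a toy length of 6 positions:

import torch

position_ids = torch.arange(6).expand((1, -1))
print(position_ids)        # tensor([[0, 1, 2, 3, 4, 5]])
print(position_ids.shape)  # torch.Size([1, 6])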
@@ -362,6 +362,11 @@ class XcodecPreTrainedModel(PreTrainedAudioTokenizerBase):
                if isinstance(submodule, nn.Conv1d):
                    init.trunc_normal_(submodule.weight, std=0.02)
                    init.constant_(submodule.bias, 0)
+       elif isinstance(module, XcodecEuclideanCodebook):
+           init.copy_(module.inited, torch.Tensor([True]))
+           init.zeros_(module.cluster_size)
+           init.zeros_(module.embed)
+           init.zeros_(module.embed_avg)

    def apply_weight_norm(self):
        """Apply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied."""
@@ -20,6 +20,7 @@ from typing import Optional, Union
 import torch
 from torch import nn

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -54,6 +55,7 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module):
    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
+       self.num_positions = num_positions
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -361,6 +363,14 @@ class XGLMPreTrainedModel(PreTrainedModel):
    supports_gradient_checkpointing = True
    _no_split_modules = ["XGLMDecoderLayer"]

+   def _init_weights(self, module):
+       super()._init_weights(module)
+       if isinstance(module, XGLMSinusoidalPositionalEmbedding):
+           emb_weights = module.get_embedding(
+               module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+           )
+           init.copy_(module.weights, emb_weights)
+

 @auto_docstring
 class XGLMModel(XGLMPreTrainedModel):
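The new XGLM `_init_weights` above follows the same pattern as the other hunks in this diff: defer to the parent initializer, then deterministically refill module-specific buffers. A rough standalone sketch of that pattern with a toy module and plain `torch.no_grad()` copies; `ToySinusoidalEmbedding` and `build_table` are hypothetical stand-ins, not transformers APIs.

import torch
from torch import nn


class ToySinusoidalEmbedding(nn.Module):
    def __init__(self, num_positions: int = 16, dim: int = 8):
        super().__init__()
        self.register_buffer("weights", torch.empty(num_positions, dim), persistent=False)


def build_table(num_positions: int, dim: int) -> torch.Tensor:
    # stand-in for XGLMSinusoidalPositionalEmbedding.get_embedding(...)
    return torch.zeros(num_positions, dim)


def init_weights(module: nn.Module) -> None:
    # generic init would run first; module-specific buffers are refilled afterwards
    if isinstance(module, ToySinusoidalEmbedding):
        with torch.no_grad():
            module.weights.copy_(build_table(*module.weights.shape))


model = ToySinusoidalEmbedding()
model.apply(init_weights)  # analogous to the init pass post_init() runs over all submodules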
@@ -603,9 +603,6 @@ class XLMPreTrainedModel(PreTrainedModel):
    config: XLMConfig
    base_model_prefix = "transformer"

-   def __init__(self, *inputs, **kwargs):
-       super().__init__(*inputs, **kwargs)
-
    @property
    def dummy_inputs(self):
        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
@@ -633,15 +630,17 @@ class XLMPreTrainedModel(PreTrainedModel):
        if isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
-       if isinstance(module, XLMModel) and self.config.sinusoidal_embeddings:
-           init.copy_(
-               module.position_embeddings.weight,
-               create_sinusoidal_embeddings(
-                   self.config.max_position_embeddings,
-                   self.config.emb_dim,
-                   out=torch.empty_like(module.position_embeddings.weight),
-               ),
-           )
+       if isinstance(module, XLMModel):
+           if self.config.sinusoidal_embeddings:
+               init.copy_(
+                   module.position_embeddings.weight,
+                   create_sinusoidal_embeddings(
+                       self.config.max_position_embeddings,
+                       self.config.emb_dim,
+                       out=torch.empty_like(module.position_embeddings.weight),
+                   ),
+               )
+           init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 @dataclass
@@ -738,10 +737,10 @@ class XLMModel(XLMPreTrainedModel):
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))

        # Initialize weights and apply final processing
-       self.post_init()
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
+       self.post_init()

    def get_input_embeddings(self):
        return self.embeddings
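The XLMModel hunk above only reorders `__init__`: the `position_ids` buffer is now registered before `post_init()`, so the buffer already exists when the post-init weight-initialization pass (see the `_init_weights` hunk earlier) walks the module. A toy illustration of why the ordering matters, using a plain init pass as a stand-in for the real `post_init()`:

import torch
from torch import nn


def fake_post_init(model: nn.Module) -> None:
    # stand-in for post_init(): walk the modules and fill known buffers
    for module in model.modules():
        if hasattr(module, "position_ids"):
            with torch.no_grad():
                module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class Toy(nn.Module):
    def __init__(self, register_first: bool):
        super().__init__()
        if register_first:
            self.register_buffer("position_ids", torch.zeros(1, 4, dtype=torch.long), persistent=False)
            fake_post_init(self)  # buffer exists -> gets filled
        else:
            fake_post_init(self)  # runs before the buffer exists -> nothing to fill
            self.register_buffer("position_ids", torch.zeros(1, 4, dtype=torch.long), persistent=False)


print(Toy(register_first=True).position_ids)   # tensor([[0, 1, 2, 3]])
print(Toy(register_first=False).position_ids)  # tensor([[0, 0, 0, 0]])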
@@ -946,7 +945,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel, GenerationMixin):
    def set_output_embeddings(self, new_embeddings):
        self.pred_layer.proj = new_embeddings

-   def prepare_inputs_for_generation(self, input_ids, **kwargs):
+   def prepare_inputs_for_generation(self, input_ids, is_first_iteration=False, **kwargs):
        # Overwritten -- this model uses config options to prepare inputs

        mask_token_id = self.config.mask_token_id
@@ -54,6 +54,112 @@ from .configuration_xlm_roberta import XLMRobertaConfig
 logger = logging.get_logger(__name__)


+class XLMRobertaEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                )
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        batch_size, seq_length = input_shape
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
+                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
+                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
+                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = embeddings + position_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    @staticmethod
+    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+    @staticmethod
+    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+        are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            x: torch.Tensor x:
+
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
+
+
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
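The `create_position_ids_from_input_ids` helper in the class added above mirrors fairseq's `make_positions`: padded positions keep `padding_idx`, real tokens count up from `padding_idx + 1`. A quick worked example of the formula with a toy pad token (padding_idx=1, past_key_values_length=0):

import torch

padding_idx = 1
input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # 1 is the pad token here

mask = input_ids.ne(padding_idx).int()                                   # [[1, 1, 1, 0, 0]]
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + 0) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental_indices.long() + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 1, 1]])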
@@ -417,112 +523,9 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, XLMRobertaLMHead):
             init.zeros_(module.bias)
-
-
-
-    """Construct the embeddings from word, position and token_type embeddings."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-        self.register_buffer(
-            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
-        )
-
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        past_key_values_length: int = 0,
-    ) -> torch.Tensor:
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                )
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
-
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        batch_size, seq_length = input_shape
-
-        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
-        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
-        # issue #5664
-        if token_type_ids is None:
-            if hasattr(self, "token_type_ids"):
-                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
-                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
-                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
-                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
-            else:
-                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = inputs_embeds + token_type_embeddings
-
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings = embeddings + position_embeddings
-
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-    @staticmethod
-    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-        Args:
-            inputs_embeds: torch.Tensor
-
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-
-    @staticmethod
-    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-        are ignored. This is modified from fairseq's `utils.make_positions`.
-
-        Args:
-            x: torch.Tensor x:
-
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
+        elif isinstance(module, XLMRobertaEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class XLMRobertaEncoder(nn.Module):
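
The `elif isinstance(module, ...Embeddings)` branches added in these hunks all re-fill non-persistent buffers during weight initialization instead of computing them in `__init__`. A minimal standalone sketch of that pattern, using plain in-place tensor ops as stand-ins for the package's `init.copy_` / `init.zeros_` helpers:

import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    def __init__(self, max_positions=8):
        super().__init__()
        # Buffers are registered with placeholder contents; _init_weights fills them in.
        self.register_buffer("position_ids", torch.zeros(1, max_positions, dtype=torch.long), persistent=False)
        self.register_buffer("token_type_ids", torch.ones(1, max_positions, dtype=torch.long), persistent=False)


def _init_weights(module):
    if isinstance(module, ToyEmbeddings):
        # Stand-ins for init.copy_ and init.zeros_: write the expected values into the buffers in place.
        module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        module.token_type_ids.zero_()
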
@@ -542,6 +542,9 @@ class XLMRobertaXLPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, XLMRobertaXLLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, XLMRobertaXLEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class XLMRobertaXLPooler(nn.Module):
@@ -1244,7 +1244,9 @@ class XLNetLMHeadModel(XLNetPreTrainedModel, GenerationMixin):
     def set_output_embeddings(self, new_embeddings):
         self.lm_loss = new_embeddings
 
-    def prepare_inputs_for_generation(
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, use_mems=None, is_first_iteration=False, **kwargs
+    ):
         # Overwritten -- this model has unique input preparation
 
         # Add dummy token at the end (no attention on this one)
@@ -634,6 +634,9 @@ class XmodPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, XmodLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, XmodEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
     def set_default_language(self, language: str):
         """
@@ -54,7 +54,7 @@ def load_cuda_kernels():
     global lsh_cumulation
     if not is_kernels_available():
         raise ImportError("kernels is not installed, please install it with `pip install kernels`")
-    from
+    from ...integrations.hub_kernels import get_kernel
 
     yoso = get_kernel("kernels-community/yoso")
     lsh_cumulation = yoso.lsh_cumulation
@@ -611,6 +611,9 @@ class YosoPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, YosoLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, YosoEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring
@@ -1099,6 +1099,7 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `ZambaHybridDynamicCache`
@@ -1132,7 +1133,7 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
             position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and
+        if inputs_embeds is not None and is_first_iteration:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
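
The `is_first_iteration` additions here, in the XLNet hunk above, and in the Zamba2 hunks below all follow one contract: the generation loop flags the prompt-processing step, and `inputs_embeds` are only used then. A simplified sketch of that decision, with the surrounding method trimmed away and names other than `is_first_iteration` chosen for illustration:

def select_model_inputs(input_ids, inputs_embeds=None, is_first_iteration=False):
    # Embeddings supplied by the caller are only valid for the first (prefill) step;
    # after that, generation continues from the newly sampled token ids.
    if inputs_embeds is not None and is_first_iteration:
        return {"inputs_embeds": inputs_embeds}
    return {"input_ids": input_ids.contiguous()}
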
@@ -225,7 +225,7 @@ class Zamba2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
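
Registering `original_inv_freq` as a non-persistent buffer (rather than a plain attribute) keeps the pristine frequencies on the module's device and dtype and lets dynamic RoPE rescaling be undone. A toy illustration of the idea, not the package's class:

import torch
from torch import nn


class ToyRotary(nn.Module):
    def __init__(self, dim=8, base=10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Cloned copy: survives in-place rescaling of inv_freq and follows .to(...) with the module.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    def reset_frequencies(self):
        self.inv_freq.copy_(self.original_inv_freq)
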
@@ -1545,6 +1545,7 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `Zamba2HybridDynamicCache`
@@ -1578,7 +1579,7 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
             position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and
+        if inputs_embeds is not None and is_first_iteration:
            model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
@@ -37,7 +37,7 @@ class ZoeDepthConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        backbone_config (`Union[dict
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `BeitConfig()`):
             The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
@@ -171,9 +171,7 @@ class ZoeDepthImageProcessorFast(BaseImageProcessorFast):
             if do_normalize:
                 stacked_images = self.normalize(stacked_images, image_mean, image_std)
             resized_images_grouped[shape] = stacked_images
-
-
-        processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images
+        processed_images = reorder_images(resized_images_grouped, grouped_images_index)
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
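
The replacement line restores the usual group-by-shape → process → reorder flow of the fast image processors. A simplified sketch of that pattern; the helper names below are illustrative stand-ins, not the library's `reorder_images` implementation:

import torch


def group_by_shape(images):
    # Stack images of identical shape together, remembering where each one came from.
    grouped, index = {}, []
    for img in images:
        grouped.setdefault(img.shape, []).append(img)
        index.append((img.shape, len(grouped[img.shape]) - 1))
    return {shape: torch.stack(imgs) for shape, imgs in grouped.items()}, index


def reorder(grouped, index):
    # Undo the grouping so outputs line up with the original input order.
    return [grouped[shape][pos] for shape, pos in index]
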
@@ -21,6 +21,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_outputs import DepthEstimatorOutput
 from ...modeling_utils import PreTrainedModel
@@ -1211,6 +1212,12 @@ class ZoeDepthPreTrainedModel(PreTrainedModel):
     input_modalities = ("image",)
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LogBinomialSoftmax):
+            init.copy_(module.k_idx, torch.arange(0, module.k).view(1, -1, 1, 1))
+            init.copy_(module.k_minus_1, torch.tensor([module.k - 1]).view(1, -1, 1, 1))
+
 
 @auto_docstring(
     custom_intro="""
@@ -22,7 +22,7 @@ from huggingface_hub import is_offline_mode, model_info
 
 from ..configuration_utils import PreTrainedConfig
 from ..dynamic_module_utils import get_class_from_dynamic_module
-from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..feature_extraction_utils import FeatureExtractionMixin, PreTrainedFeatureExtractor
 from ..image_processing_utils import BaseImageProcessor
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
@@ -700,12 +700,14 @@ def pipeline(
 
     code_revision = kwargs.pop("code_revision", None)
     commit_hash = kwargs.pop("_commit_hash", None)
+    local_files_only = kwargs.get("local_files_only", False)
 
     hub_kwargs = {
         "revision": revision,
         "token": token,
         "trust_remote_code": trust_remote_code,
         "_commit_hash": commit_hash,
+        "local_files_only": local_files_only,
     }
 
     if task is None and model is None:
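
With `local_files_only` now forwarded through `hub_kwargs`, an offline pipeline call resolves every artifact from the local cache. A usage sketch; the checkpoint name is only an example and is assumed to be cached already:

from transformers import pipeline

# Nothing is fetched from the Hub; all files must already be in the local cache.
clf = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    local_files_only=True,
)
print(clf("works offline"))
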
@@ -986,12 +988,13 @@ def pipeline(
             feature_extractor = AutoFeatureExtractor.from_pretrained(
                 feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs
             )
+            config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
+                pretrained_model_name_or_path or model_name,
+                **hub_kwargs,
+            )
+            processor_class = config_dict.get("processor_class", None)
 
-            if (
-                feature_extractor._processor_class
-                and feature_extractor._processor_class.endswith("WithLM")
-                and isinstance(model_name, str)
-            ):
+            if processor_class is not None and processor_class.endswith("WithLM") and isinstance(model_name, str):
                 try:
                     import kenlm  # to trigger `ImportError` if not installed
                     from pyctcdecode import BeamSearchDecoderCTC
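
The WithLM check now keys off the `processor_class` entry of the feature extractor's config dict rather than a private attribute on the instantiated object. Schematically, with illustrative dict contents rather than a real checkpoint's config:

config_dict = {
    "feature_extractor_type": "Wav2Vec2FeatureExtractor",
    "processor_class": "Wav2Vec2ProcessorWithLM",
}
processor_class = config_dict.get("processor_class", None)
if processor_class is not None and processor_class.endswith("WithLM"):
    # At this point the pipeline attempts to load a beam-search decoder (kenlm / pyctcdecode).
    pass
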