transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -79,7 +79,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
             Controls the size of the output image after resizing. Can be overridden by the `size` parameter in the
             `preprocess` method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter
             in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):

@@ -112,7 +112,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         self,
         do_resize: bool = True,
         size: Optional[dict[str, int]] = None,
-        resample: PILImageResampling = PILImageResampling.
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
         rescale_factor: Union[int, float] = 1 / 255,
         do_center_crop: bool = True,

@@ -137,12 +137,12 @@ class MobileViTImageProcessor(BaseImageProcessor):
         self.do_flip_channel_order = do_flip_channel_order
         self.do_reduce_labels = do_reduce_labels

-    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
+    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
         size: dict[str, int],
-        resample: PILImageResampling = PILImageResampling.
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,

@@ -156,7 +156,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
                Image to resize.
            size (`dict[str, int]`):
                Size of the output image.
-           resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.
+           resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resiizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.

@@ -42,7 +42,7 @@ from .image_processing_mobilevit import MobileVitImageProcessorKwargs

 @auto_docstring
 class MobileViTImageProcessorFast(BaseImageProcessorFast):
-    resample = PILImageResampling.
+    resample = PILImageResampling.BICUBIC
     size = {"shortest_edge": 224}
     default_to_square = False
     crop_size = {"height": 256, "width": 256}

@@ -182,7 +182,6 @@ class MobileViTImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)

         # Stack all processed images if return_tensors is specified
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
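
The hunks above switch the MobileViT image processors' default `resample` to `PILImageResampling.BICUBIC` and drop the explicit `torch.stack` in the fast path (the `BatchFeature(..., tensor_type=...)` conversion already handles tensorization). A minimal usage sketch showing that the filter can still be overridden per call; the checkpoint name and image path are illustrative, and this snippet is not part of the diff:

```python
from PIL import Image
from transformers import AutoImageProcessor
from transformers.image_utils import PILImageResampling

# Illustrative checkpoint; any MobileViT checkpoint behaves the same way.
processor = AutoImageProcessor.from_pretrained("apple/mobilevit-small")
image = Image.open("cat.png").convert("RGB")

# rc2 default: bicubic resampling.
inputs = processor(images=image, return_tensors="pt")

# The resampling filter can still be pinned explicitly per call.
inputs_bilinear = processor(images=image, resample=PILImageResampling.BILINEAR, return_tensors="pt")

print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 256, 256])
```
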
@@ -615,6 +615,10 @@ class MobileViTPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)

@@ -582,6 +582,10 @@ class MobileViTV2PreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.GroupNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
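
Both `_init_weights` hunks above add a branch that resets the running statistics of batch-norm style modules, which live in buffers rather than parameters and would otherwise be skipped by a parameter-only init pass. A standalone sketch of the same idea in plain PyTorch (illustrative helper, not the transformers implementation):

```python
import torch
from torch import nn

def reset_batchnorm_buffers(module: nn.Module) -> None:
    # BatchNorm keeps its running statistics as buffers, so they need an
    # explicit reset alongside the usual weight/bias initialization.
    if getattr(module, "running_mean", None) is not None:
        nn.init.zeros_(module.running_mean)        # E[x] estimate back to 0
        nn.init.ones_(module.running_var)          # Var[x] estimate back to 1
        nn.init.zeros_(module.num_batches_tracked)

bn = nn.BatchNorm2d(8)
bn.running_mean.add_(3.0)  # pretend the stats drifted
reset_batchnorm_buffers(bn)
print(bn.running_mean.abs().max().item())  # 0.0
```
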
@@ -268,7 +268,7 @@ class ModernBertRotaryEmbedding(nn.Module):
             rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
             curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type)
             self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
-
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
             setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)

     @staticmethod

@@ -677,6 +677,17 @@ class ModernBertPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+        elif isinstance(module, ModernBertUnpaddedRotaryEmbedding):
+            inv_freq = module._compute_inv_freq()
+            init.copy_(module.inv_freq, inv_freq)

     def _check_and_adjust_attn_implementation(
         self, attn_implementation: Optional[str], is_init_check: bool = False

@@ -35,7 +35,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_rope_utils import RopeParameters
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, is_flash_attn_2_available, logging
 from ...utils.import_utils import is_triton_available

@@ -871,6 +871,17 @@ class ModernBertPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+        elif isinstance(module, ModernBertUnpaddedRotaryEmbedding):
+            inv_freq = module._compute_inv_freq()
+            init.copy_(module.inv_freq, inv_freq)

     def _check_and_adjust_attn_implementation(
         self, attn_implementation: Optional[str], is_init_check: bool = False
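
The ModernBert hunks above register `original_inv_freq` as a non-persistent buffer instead of a plain attribute, and re-derive both inverse-frequency buffers inside `_init_weights`. A small sketch of why a non-persistent buffer is the natural choice here, using toy names rather than the ModernBert classes:

```python
import torch
from torch import nn

class ToyRotaryEmbedding(nn.Module):
    def __init__(self, dim: int = 8, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Non-persistent buffers follow .to()/.cuda()/.half() with the module
        # but are not written to (or expected in) the state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

module = ToyRotaryEmbedding().to(torch.float64)
print(module.original_inv_freq.dtype)               # torch.float64 — moved with the module
print("original_inv_freq" in module.state_dict())   # False — non-persistent
```
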
@@ -119,7 +119,7 @@ class ModernBertDecoderRotaryEmbedding(nn.Module):
             rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
             curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type)
             self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
-
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
             setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)

     @staticmethod

@@ -443,6 +443,14 @@ class ModernBertDecoderPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertDecoderRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)


 @auto_docstring

@@ -28,7 +28,7 @@ from ...generation import GenerationMixin
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
-from ...modeling_rope_utils import RopeParameters
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging

@@ -482,6 +482,14 @@ class ModernBertDecoderPreTrainedModel(ModernBertPreTrainedModel):
             init.ones_(module.weight)
             if module.bias is not None:
                 init.zeros_(module.bias)
+        elif isinstance(module, ModernBertDecoderRotaryEmbedding):
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)

     def _check_and_adjust_attn_implementation(self, attn_implementation, is_init_check):
         raise AttributeError("No need to inherit!")
@@ -98,7 +98,7 @@ class MoonshineRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(

@@ -289,7 +289,7 @@ class MoshiRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -609,8 +609,8 @@ class MoshiFlashAttention2(MoshiAttention):
                 else torch.get_autocast_gpu_dtype()
             )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "
-                target_dtype = self.config.
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype
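
The hunk above changes which attribute Moshi's flash-attention path reads when it has to upcast-proof float32 hidden states for a quantized model: it now falls back to `config.dtype`. A hedged restatement of the whole dtype cascade as a standalone helper (the names follow the hunk; this is not a public transformers API):

```python
import torch

def pick_flash_attn_dtype(config, q_proj_weight: torch.Tensor, input_dtype: torch.dtype) -> torch.dtype:
    """Illustrative re-statement of the dtype fallback chain shown in the hunk above."""
    if input_dtype != torch.float32:
        return input_dtype                     # inputs are already in a half-precision dtype
    if torch.is_autocast_enabled():
        return torch.get_autocast_gpu_dtype()  # honour autocast when it is active
    if hasattr(config, "quantization_config"):
        return config.dtype                    # quantized weights: trust the configured dtype
    return q_proj_weight.dtype                 # otherwise follow the projection weights
```
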
@@ -869,6 +869,8 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin):
         self.gradient_checkpointing = False
         self.config = config

+        self.post_init()
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -2178,6 +2180,7 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
         user_delay_pattern_mask=None,
         moshi_delay_pattern_mask=None,
         kwargs_depth_decoder=None,
+        is_first_iteration=False,
         blank_user_audio_codes: Optional[torch.FloatTensor] = None,
         **kwargs,
     ):

@@ -2189,49 +2192,21 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
         # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
         # (we can't check exception 3 while compiling)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
-            if model_inputs["inputs_embeds"] is not None:
-                batch_size, sequence_length, _ = inputs_embeds.shape
-                device = inputs_embeds.device
-            else:
-                batch_size, sequence_length = input_ids.shape
-                device = input_ids.device
-
-            attention_mask = self.decoder.model._prepare_4d_causal_attention_mask_with_cache_position(
-                attention_mask,
-                sequence_length=sequence_length,
-                target_length=past_key_values.get_max_cache_shape(),
-                dtype=self.decoder.lm_head.weight.dtype,
-                device=device,
-                cache_position=cache_position,
-                batch_size=batch_size,
-                config=self.config,
-                past_key_values=past_key_values,
-            )
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "cache_position": cache_position,
-            }
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            user_delay_pattern_mask=user_delay_pattern_mask,
+            moshi_delay_pattern_mask=moshi_delay_pattern_mask,
+            kwargs_depth_decoder=kwargs_depth_decoder,
+            is_first_iteration=is_first_iteration,
+            blank_user_audio_codes=blank_user_audio_codes,
+            **kwargs,
         )

         # 2. Now that everything is prepared, generate audio_codes using the depth decoder

@@ -2270,11 +2245,6 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin):
             model_inputs["input_ids"] = None
             model_inputs["inputs_embeds"] = inputs_embeds

-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
-
         return model_inputs

     def _update_model_kwargs_for_generation(
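
The Moshi hunks above give the depth decoder a `post_init()` call and replace the hand-rolled `prepare_inputs_for_generation` body with a call to the parent implementation, forwarding the Moshi-specific keyword arguments (delay-pattern masks, depth-decoder kwargs, `is_first_iteration`, blank audio codes) through it instead of manually copying leftover kwargs at the end. A toy sketch of that delegation pattern (not the Moshi or `GenerationMixin` code):

```python
class BaseForGeneration:
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # Stand-in for the parent: assemble the standard inputs and keep any
        # extra keyword arguments it was handed in the returned dict.
        model_inputs = {"input_ids": input_ids}
        model_inputs.update(kwargs)
        return model_inputs

class SpeechTextForGeneration(BaseForGeneration):
    def prepare_inputs_for_generation(self, input_ids, user_delay_pattern_mask=None, **kwargs):
        # Delegate the generic bookkeeping to the parent and only layer the
        # model-specific pieces on top, instead of duplicating the base logic.
        return super().prepare_inputs_for_generation(
            input_ids, user_delay_pattern_mask=user_delay_pattern_mask, **kwargs
        )

prepared = SpeechTextForGeneration().prepare_inputs_for_generation(
    [[1, 2, 3]], user_delay_pattern_mask="mask", use_cache=True
)
print(sorted(prepared))  # ['input_ids', 'use_cache', 'user_delay_pattern_mask']
```
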
@@ -52,6 +52,8 @@ class MPNetPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MPNetLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, MPNetEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 class MPNetEmbeddings(nn.Module):
@@ -54,7 +54,7 @@ def load_cuda_kernels():
     global mra_cuda_kernel
     if not is_kernels_available():
        raise ImportError("kernels is not installed, please install it with `pip install kernels`")
-    from kernels import get_kernel
+    from ...integrations.hub_kernels import get_kernel
 
     mra_cuda_kernel = get_kernel("kernels-community/mra")
 
@@ -796,6 +796,9 @@ class MraPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, MraLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, MraEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring
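The MPNet and Mra hunks above follow the same pattern: buffers such as `position_ids` are (re)filled inside `_init_weights` through the new `transformers.initialization` helpers instead of being materialized in `__init__`. A toy sketch of that idea in plain PyTorch, assuming `init.copy_`/`init.zeros_` behave like in-place writes into an existing tensor:

```python
import torch
import torch.nn as nn

# Toy sketch, not the transformers classes: a non-persistent buffer is declared
# empty in __init__ and filled by an _init_weights-style hook, mirroring
# init.copy_(module.position_ids, torch.arange(...).expand((1, -1))).
class ToyEmbeddings(nn.Module):
    def __init__(self, max_positions: int = 16):
        super().__init__()
        self.register_buffer(
            "position_ids", torch.empty(1, max_positions, dtype=torch.long), persistent=False
        )


def init_weights(module: nn.Module) -> None:
    if isinstance(module, ToyEmbeddings):
        with torch.no_grad():
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


model = ToyEmbeddings()
model.apply(init_weights)
print(model.position_ids[0, :5].tolist())  # [0, 1, 2, 3, 4]
```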
@@ -133,17 +133,16 @@ class MT5Config(PreTrainedConfig):
         if feed_forward_proj == "gated-gelu":
             self.dense_act_fn = "gelu_new"
 
+        # Force because official weights have False serialized, but we have to tie always
+        kwargs["tie_word_embeddings"] = True
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
             tokenizer_class=tokenizer_class,
-            tie_word_embeddings=tie_word_embeddings,
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,
             decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )
-        # TODO: Mt5 never supported not tying encoder decoder so this has to be true.
-        self.tie_encoder_decoder = True
 
 
 __all__ = ["MT5Config"]
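With this change `MT5Config` always reports tied input/output embeddings, no matter what a checkpoint serialized or what a caller passes. A quick way to observe the intended effect, assuming the keyword is still accepted and simply overridden by the forced kwarg:

```python
from transformers import MT5Config

# The constructor now sets kwargs["tie_word_embeddings"] = True before calling
# super().__init__, so even an explicit False should be overridden.
config = MT5Config(tie_word_embeddings=False)
print(config.tie_word_embeddings)  # True
```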
@@ -860,12 +860,10 @@ class MT5Model(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)
 
@@ -1043,12 +1041,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)
 
@@ -1066,7 +1062,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         self.decoder.set_input_embeddings(new_embeddings)
 
     @auto_docstring
-    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with google-t5/->google/, T5->MT5, t5->mt5
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1184,9 +1179,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
 
         sequence_output = decoder_outputs[0]
 
-        if self.config.tie_word_embeddings:
-            sequence_output = sequence_output * (self.model_dim**-0.5)
-
         lm_logits = self.lm_head(sequence_output)
 
         loss = None
@@ -1551,12 +1543,10 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)
 
@@ -117,6 +117,7 @@ class MusicgenSinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)
 
     def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -432,6 +433,9 @@ class MusicgenPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)
 
 
 class MusicgenDecoder(MusicgenPreTrainedModel):
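Storing `num_positions` on the module is what makes the new `_init_weights` branch possible: the sinusoidal table can be regenerated from `(num_positions, embedding_dim)` alone and copied into `module.weights`. A standalone sketch of the kind of table such a `get_embedding` computes; this is the usual fairseq-style formulation and the exact Musicgen formula may differ in details such as the sin/cos ordering:

```python
import math

import torch

# Hypothetical standalone sinusoidal position table; the point is only that the
# full (num_positions, embedding_dim) tensor is derivable from the two integers
# the module now remembers, so it can be rebuilt during weight initialization.
def sinusoidal_table(num_positions: int, embedding_dim: int) -> torch.Tensor:
    half_dim = embedding_dim // 2
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float) * -(math.log(10000.0) / (half_dim - 1)))
    angles = torch.arange(num_positions, dtype=torch.float).unsqueeze(1) * freqs.unsqueeze(0)
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)


print(sinusoidal_table(8, 4).shape)  # torch.Size([8, 4])
```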
@@ -2082,7 +2086,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         synced_gpus: Optional[bool] = None,
         streamer: Optional["BaseStreamer"] = None,
-        use_model_defaults: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -2127,11 +2130,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
             streamer (`BaseStreamer`, *optional*):
                 Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                 through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
-            use_model_defaults (`bool`, *optional*):
-                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
-                generation configuration (`model.generation_config`), as opposed to the global defaults
-                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
-                `True`.
             kwargs (`dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -2155,9 +2153,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
         """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
         generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
             raise ValueError(
@@ -122,6 +122,7 @@ class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)
 
     def make_weights(self, num_embeddings: int, embedding_dim: int):
@@ -403,6 +404,9 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenMelodySinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)
 
 
 # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder with MUSICGEN->MUSICGEN_MELODY,Musicgen->MusicgenMelody
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -469,6 +470,11 @@ class MvpPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, MvpForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -1509,6 +1515,7 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = MvpDecoder(config)
+        self.post_init()
 
     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -74,7 +74,7 @@ class NanoChatRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -110,7 +110,7 @@ class NemotronRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     # Ignore copy
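In both rotary-embedding classes the saved copy of `inv_freq` is now a registered, non-persistent buffer rather than a plain attribute, so it follows the module across devices and dtypes while staying out of the `state_dict`. A small illustration of that buffer behaviour, independent of the transformers classes:

```python
import torch
import torch.nn as nn

# Not the transformers class: just demonstrates why register_buffer(..., persistent=False)
# is preferable to a bare attribute for the saved copy of inv_freq.
class ToyRope(nn.Module):
    def __init__(self, dim: int = 8):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


m = ToyRope()
print(list(m.state_dict().keys()))                   # [] -> non-persistent buffers are not serialized
print(m.to(torch.float64).original_inv_freq.dtype)   # torch.float64 -> buffers follow module.to(...)
```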
@@ -397,8 +397,8 @@ class NemotronFlashAttention2(NemotronAttention):
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype
 
@@ -206,6 +206,7 @@ class NllbMoeConfig(PreTrainedConfig):
         self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
         self.moe_token_dropout = moe_token_dropout
         self.output_router_logits = output_router_logits
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -21,6 +21,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -66,6 +67,7 @@ class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -665,6 +667,14 @@ class NllbMoePreTrainedModel(PreTrainedModel):
     _supports_sdpa = False
     _supports_flex_attn = False
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, NllbMoeSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+
 
 class NllbMoeEncoder(NllbMoePreTrainedModel):
     _can_record_outputs = {
@@ -290,7 +290,6 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
@@ -441,31 +441,26 @@ class NougatTokenizer(TokenizersBackend):
         )
         self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
 
-
-
-
-
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
         self._tokenizer.post_processor = processors.TemplateProcessing(
             single=f"{bos_token}:0 $A:0 {eos_token}:0",
             pair="$A:0 $B:1",
             special_tokens=[
-                (str(eos_token), eos_token_id),
-                (str(bos_token), bos_token_id),
+                (str(eos_token), self.eos_token_id),
+                (str(bos_token), self.bos_token_id),
             ],
         )
 
         # Enable truncation and padding
         self._tokenizer.enable_truncation(max_length=4096)
-        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
-
-        super().__init__(
-            errors=errors,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
+        self._tokenizer.enable_padding(length=4096, pad_id=self.pad_token_id, pad_token=str(pad_token))
 
     def remove_hallucinated_references(self, text: str) -> str:
         """