transformers 5.0.0__py3-none-any.whl → 5.0.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
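
As a rough, hedged illustration of how a file-level comparison like the listing below can be reproduced locally, here is a minimal Python sketch. It assumes both wheel archives have already been downloaded next to the script (for example via `pip download transformers==5.0.0 --no-deps` and `pip download --pre transformers==5.0.0rc0 --no-deps`); the local filenames are placeholder assumptions, and the sketch only diffs the member file lists rather than the per-file line counts shown below.

    # Minimal sketch, not the registry's own tooling: compare the file listings
    # of two locally downloaded wheels (the paths below are placeholder assumptions).
    import zipfile

    BASE_WHEEL = "transformers-5.0.0-py3-none-any.whl"
    TARGET_WHEEL = "transformers-5.0.0rc0-py3-none-any.whl"

    def members(path: str) -> set[str]:
        # A wheel is a zip archive; namelist() returns every packaged file.
        with zipfile.ZipFile(path) as wheel:
            return set(wheel.namelist())

    base_files, target_files = members(BASE_WHEEL), members(TARGET_WHEEL)
    print("only in 5.0.0rc0:", sorted(target_files - base_files)[:10])
    print("only in 5.0.0:   ", sorted(base_files - target_files)[:10])
    print("present in both: ", len(base_files & target_files))
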
Files changed (1606)
  1. transformers/__init__.py +36 -55
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +33 -32
  4. transformers/cache_utils.py +139 -32
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +19 -49
  7. transformers/cli/transformers.py +1 -2
  8. transformers/configuration_utils.py +155 -129
  9. transformers/conversion_mapping.py +22 -158
  10. transformers/convert_slow_tokenizer.py +17 -227
  11. transformers/core_model_loading.py +185 -528
  12. transformers/data/data_collator.py +4 -12
  13. transformers/data/processors/glue.py +1 -0
  14. transformers/data/processors/utils.py +1 -0
  15. transformers/data/processors/xnli.py +1 -0
  16. transformers/dependency_versions_check.py +1 -0
  17. transformers/dependency_versions_table.py +7 -5
  18. transformers/distributed/configuration_utils.py +2 -1
  19. transformers/dynamic_module_utils.py +25 -24
  20. transformers/feature_extraction_sequence_utils.py +23 -19
  21. transformers/feature_extraction_utils.py +33 -64
  22. transformers/file_utils.py +1 -0
  23. transformers/generation/__init__.py +1 -11
  24. transformers/generation/candidate_generator.py +33 -80
  25. transformers/generation/configuration_utils.py +133 -189
  26. transformers/generation/continuous_batching/__init__.py +1 -4
  27. transformers/generation/continuous_batching/cache.py +25 -83
  28. transformers/generation/continuous_batching/cache_manager.py +45 -155
  29. transformers/generation/continuous_batching/continuous_api.py +147 -270
  30. transformers/generation/continuous_batching/requests.py +3 -51
  31. transformers/generation/continuous_batching/scheduler.py +105 -160
  32. transformers/generation/logits_process.py +128 -0
  33. transformers/generation/stopping_criteria.py +1 -1
  34. transformers/generation/streamers.py +1 -0
  35. transformers/generation/utils.py +123 -122
  36. transformers/generation/watermarking.py +6 -8
  37. transformers/hf_argparser.py +13 -9
  38. transformers/hyperparameter_search.py +2 -1
  39. transformers/image_processing_base.py +23 -12
  40. transformers/image_processing_utils.py +15 -11
  41. transformers/image_processing_utils_fast.py +75 -85
  42. transformers/image_transforms.py +42 -73
  43. transformers/image_utils.py +32 -30
  44. transformers/initialization.py +0 -37
  45. transformers/integrations/__init__.py +2 -16
  46. transformers/integrations/accelerate.py +113 -58
  47. transformers/integrations/aqlm.py +66 -36
  48. transformers/integrations/awq.py +516 -45
  49. transformers/integrations/bitnet.py +105 -47
  50. transformers/integrations/bitsandbytes.py +202 -91
  51. transformers/integrations/deepspeed.py +4 -161
  52. transformers/integrations/eetq.py +82 -84
  53. transformers/integrations/executorch.py +1 -1
  54. transformers/integrations/fbgemm_fp8.py +145 -190
  55. transformers/integrations/finegrained_fp8.py +215 -249
  56. transformers/integrations/flash_attention.py +3 -3
  57. transformers/integrations/flex_attention.py +1 -1
  58. transformers/integrations/fp_quant.py +0 -90
  59. transformers/integrations/ggml.py +2 -11
  60. transformers/integrations/higgs.py +62 -37
  61. transformers/integrations/hub_kernels.py +8 -65
  62. transformers/integrations/integration_utils.py +3 -47
  63. transformers/integrations/mistral.py +0 -12
  64. transformers/integrations/mxfp4.py +80 -33
  65. transformers/integrations/peft.py +191 -483
  66. transformers/integrations/quanto.py +56 -77
  67. transformers/integrations/spqr.py +90 -42
  68. transformers/integrations/tensor_parallel.py +221 -167
  69. transformers/integrations/torchao.py +43 -35
  70. transformers/integrations/vptq.py +59 -40
  71. transformers/kernels/__init__.py +0 -0
  72. transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
  73. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
  74. transformers/loss/loss_utils.py +0 -2
  75. transformers/masking_utils.py +55 -51
  76. transformers/model_debugging_utils.py +5 -4
  77. transformers/modelcard.py +194 -15
  78. transformers/modeling_attn_mask_utils.py +19 -19
  79. transformers/modeling_flash_attention_utils.py +27 -27
  80. transformers/modeling_gguf_pytorch_utils.py +24 -79
  81. transformers/modeling_layers.py +22 -21
  82. transformers/modeling_outputs.py +253 -242
  83. transformers/modeling_rope_utils.py +117 -138
  84. transformers/modeling_utils.py +739 -850
  85. transformers/models/__init__.py +0 -27
  86. transformers/models/afmoe/configuration_afmoe.py +33 -40
  87. transformers/models/afmoe/modeling_afmoe.py +54 -42
  88. transformers/models/afmoe/modular_afmoe.py +33 -23
  89. transformers/models/aimv2/configuration_aimv2.py +10 -2
  90. transformers/models/aimv2/modeling_aimv2.py +42 -47
  91. transformers/models/aimv2/modular_aimv2.py +19 -17
  92. transformers/models/albert/configuration_albert.py +2 -8
  93. transformers/models/albert/modeling_albert.py +69 -70
  94. transformers/models/albert/tokenization_albert.py +14 -5
  95. transformers/models/align/configuration_align.py +6 -8
  96. transformers/models/align/modeling_align.py +89 -94
  97. transformers/models/align/processing_align.py +30 -2
  98. transformers/models/altclip/configuration_altclip.py +7 -4
  99. transformers/models/altclip/modeling_altclip.py +103 -114
  100. transformers/models/altclip/processing_altclip.py +15 -2
  101. transformers/models/apertus/__init__.py +1 -0
  102. transformers/models/apertus/configuration_apertus.py +28 -23
  103. transformers/models/apertus/modeling_apertus.py +40 -39
  104. transformers/models/apertus/modular_apertus.py +38 -37
  105. transformers/models/arcee/configuration_arcee.py +30 -25
  106. transformers/models/arcee/modeling_arcee.py +39 -36
  107. transformers/models/arcee/modular_arcee.py +23 -20
  108. transformers/models/aria/configuration_aria.py +44 -31
  109. transformers/models/aria/image_processing_aria.py +27 -25
  110. transformers/models/aria/modeling_aria.py +106 -110
  111. transformers/models/aria/modular_aria.py +127 -118
  112. transformers/models/aria/processing_aria.py +35 -28
  113. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
  114. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
  115. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
  116. transformers/models/audioflamingo3/__init__.py +1 -0
  117. transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
  118. transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
  119. transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
  120. transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
  121. transformers/models/auto/auto_factory.py +7 -6
  122. transformers/models/auto/configuration_auto.py +5 -66
  123. transformers/models/auto/feature_extraction_auto.py +10 -14
  124. transformers/models/auto/image_processing_auto.py +41 -32
  125. transformers/models/auto/modeling_auto.py +188 -46
  126. transformers/models/auto/processing_auto.py +11 -24
  127. transformers/models/auto/tokenization_auto.py +588 -171
  128. transformers/models/auto/video_processing_auto.py +10 -12
  129. transformers/models/autoformer/configuration_autoformer.py +7 -4
  130. transformers/models/autoformer/modeling_autoformer.py +101 -104
  131. transformers/models/aya_vision/configuration_aya_vision.py +1 -4
  132. transformers/models/aya_vision/modeling_aya_vision.py +102 -71
  133. transformers/models/aya_vision/modular_aya_vision.py +74 -46
  134. transformers/models/aya_vision/processing_aya_vision.py +53 -25
  135. transformers/models/bamba/configuration_bamba.py +39 -34
  136. transformers/models/bamba/modeling_bamba.py +86 -82
  137. transformers/models/bamba/modular_bamba.py +72 -70
  138. transformers/models/bark/configuration_bark.py +8 -6
  139. transformers/models/bark/generation_configuration_bark.py +5 -3
  140. transformers/models/bark/modeling_bark.py +57 -54
  141. transformers/models/bark/processing_bark.py +41 -19
  142. transformers/models/bart/configuration_bart.py +6 -9
  143. transformers/models/bart/modeling_bart.py +126 -135
  144. transformers/models/barthez/tokenization_barthez.py +11 -3
  145. transformers/models/bartpho/tokenization_bartpho.py +7 -6
  146. transformers/models/beit/configuration_beit.py +11 -0
  147. transformers/models/beit/image_processing_beit.py +56 -53
  148. transformers/models/beit/image_processing_beit_fast.py +12 -10
  149. transformers/models/beit/modeling_beit.py +60 -69
  150. transformers/models/bert/configuration_bert.py +2 -12
  151. transformers/models/bert/modeling_bert.py +122 -114
  152. transformers/models/bert/tokenization_bert.py +23 -8
  153. transformers/models/bert/tokenization_bert_legacy.py +5 -3
  154. transformers/models/bert_generation/configuration_bert_generation.py +2 -17
  155. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  156. transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
  157. transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
  158. transformers/models/bertweet/tokenization_bertweet.py +3 -1
  159. transformers/models/big_bird/configuration_big_bird.py +9 -12
  160. transformers/models/big_bird/modeling_big_bird.py +109 -116
  161. transformers/models/big_bird/tokenization_big_bird.py +43 -16
  162. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  163. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
  164. transformers/models/biogpt/configuration_biogpt.py +2 -8
  165. transformers/models/biogpt/modeling_biogpt.py +76 -72
  166. transformers/models/biogpt/modular_biogpt.py +66 -62
  167. transformers/models/biogpt/tokenization_biogpt.py +5 -3
  168. transformers/models/bit/configuration_bit.py +1 -0
  169. transformers/models/bit/image_processing_bit.py +24 -21
  170. transformers/models/bit/image_processing_bit_fast.py +1 -0
  171. transformers/models/bit/modeling_bit.py +12 -25
  172. transformers/models/bitnet/configuration_bitnet.py +28 -23
  173. transformers/models/bitnet/modeling_bitnet.py +39 -36
  174. transformers/models/bitnet/modular_bitnet.py +6 -4
  175. transformers/models/blenderbot/configuration_blenderbot.py +5 -8
  176. transformers/models/blenderbot/modeling_blenderbot.py +96 -77
  177. transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
  178. transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
  179. transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
  180. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
  181. transformers/models/blip/configuration_blip.py +10 -9
  182. transformers/models/blip/image_processing_blip.py +20 -17
  183. transformers/models/blip/image_processing_blip_fast.py +1 -0
  184. transformers/models/blip/modeling_blip.py +108 -117
  185. transformers/models/blip/modeling_blip_text.py +65 -73
  186. transformers/models/blip/processing_blip.py +36 -5
  187. transformers/models/blip_2/configuration_blip_2.py +2 -2
  188. transformers/models/blip_2/modeling_blip_2.py +118 -146
  189. transformers/models/blip_2/processing_blip_2.py +38 -8
  190. transformers/models/bloom/configuration_bloom.py +2 -5
  191. transformers/models/bloom/modeling_bloom.py +104 -77
  192. transformers/models/blt/configuration_blt.py +86 -94
  193. transformers/models/blt/modeling_blt.py +81 -238
  194. transformers/models/blt/modular_blt.py +65 -228
  195. transformers/models/bridgetower/configuration_bridgetower.py +2 -7
  196. transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
  197. transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
  198. transformers/models/bridgetower/modeling_bridgetower.py +119 -141
  199. transformers/models/bridgetower/processing_bridgetower.py +16 -2
  200. transformers/models/bros/configuration_bros.py +18 -24
  201. transformers/models/bros/modeling_bros.py +80 -90
  202. transformers/models/bros/processing_bros.py +12 -2
  203. transformers/models/byt5/tokenization_byt5.py +6 -4
  204. transformers/models/camembert/configuration_camembert.py +2 -8
  205. transformers/models/camembert/modeling_camembert.py +195 -196
  206. transformers/models/camembert/modular_camembert.py +54 -51
  207. transformers/models/camembert/tokenization_camembert.py +13 -6
  208. transformers/models/canine/configuration_canine.py +2 -4
  209. transformers/models/canine/modeling_canine.py +75 -84
  210. transformers/models/canine/tokenization_canine.py +1 -2
  211. transformers/models/chameleon/configuration_chameleon.py +34 -29
  212. transformers/models/chameleon/image_processing_chameleon.py +24 -21
  213. transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
  214. transformers/models/chameleon/modeling_chameleon.py +93 -142
  215. transformers/models/chameleon/processing_chameleon.py +41 -16
  216. transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
  217. transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
  218. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
  219. transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
  220. transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
  221. transformers/models/clap/configuration_clap.py +9 -4
  222. transformers/models/clap/feature_extraction_clap.py +12 -11
  223. transformers/models/clap/modeling_clap.py +123 -136
  224. transformers/models/clap/processing_clap.py +15 -2
  225. transformers/models/clip/configuration_clip.py +2 -4
  226. transformers/models/clip/image_processing_clip.py +24 -21
  227. transformers/models/clip/image_processing_clip_fast.py +1 -9
  228. transformers/models/clip/modeling_clip.py +65 -65
  229. transformers/models/clip/processing_clip.py +14 -2
  230. transformers/models/clip/tokenization_clip.py +46 -21
  231. transformers/models/clipseg/configuration_clipseg.py +2 -4
  232. transformers/models/clipseg/modeling_clipseg.py +109 -119
  233. transformers/models/clipseg/processing_clipseg.py +42 -19
  234. transformers/models/clvp/configuration_clvp.py +5 -15
  235. transformers/models/clvp/feature_extraction_clvp.py +10 -7
  236. transformers/models/clvp/modeling_clvp.py +146 -155
  237. transformers/models/clvp/number_normalizer.py +2 -1
  238. transformers/models/clvp/processing_clvp.py +20 -3
  239. transformers/models/clvp/tokenization_clvp.py +64 -1
  240. transformers/models/code_llama/tokenization_code_llama.py +44 -18
  241. transformers/models/codegen/configuration_codegen.py +4 -4
  242. transformers/models/codegen/modeling_codegen.py +53 -63
  243. transformers/models/codegen/tokenization_codegen.py +47 -17
  244. transformers/models/cohere/configuration_cohere.py +30 -25
  245. transformers/models/cohere/modeling_cohere.py +42 -40
  246. transformers/models/cohere/modular_cohere.py +29 -26
  247. transformers/models/cohere/tokenization_cohere.py +46 -15
  248. transformers/models/cohere2/configuration_cohere2.py +32 -31
  249. transformers/models/cohere2/modeling_cohere2.py +44 -42
  250. transformers/models/cohere2/modular_cohere2.py +54 -54
  251. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
  252. transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
  253. transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
  254. transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
  255. transformers/models/colpali/configuration_colpali.py +1 -0
  256. transformers/models/colpali/modeling_colpali.py +16 -14
  257. transformers/models/colpali/modular_colpali.py +51 -11
  258. transformers/models/colpali/processing_colpali.py +52 -14
  259. transformers/models/colqwen2/modeling_colqwen2.py +28 -28
  260. transformers/models/colqwen2/modular_colqwen2.py +74 -37
  261. transformers/models/colqwen2/processing_colqwen2.py +52 -16
  262. transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
  263. transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
  264. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
  265. transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
  266. transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
  267. transformers/models/convbert/configuration_convbert.py +8 -11
  268. transformers/models/convbert/modeling_convbert.py +87 -94
  269. transformers/models/convbert/tokenization_convbert.py +1 -0
  270. transformers/models/convnext/configuration_convnext.py +1 -0
  271. transformers/models/convnext/image_processing_convnext.py +23 -20
  272. transformers/models/convnext/image_processing_convnext_fast.py +21 -16
  273. transformers/models/convnext/modeling_convnext.py +12 -9
  274. transformers/models/convnextv2/configuration_convnextv2.py +1 -0
  275. transformers/models/convnextv2/modeling_convnextv2.py +12 -9
  276. transformers/models/cpm/tokenization_cpm.py +7 -6
  277. transformers/models/cpm/tokenization_cpm_fast.py +5 -3
  278. transformers/models/cpmant/configuration_cpmant.py +1 -4
  279. transformers/models/cpmant/modeling_cpmant.py +40 -38
  280. transformers/models/cpmant/tokenization_cpmant.py +3 -1
  281. transformers/models/csm/configuration_csm.py +66 -58
  282. transformers/models/csm/generation_csm.py +35 -31
  283. transformers/models/csm/modeling_csm.py +85 -85
  284. transformers/models/csm/modular_csm.py +58 -58
  285. transformers/models/csm/processing_csm.py +68 -25
  286. transformers/models/ctrl/configuration_ctrl.py +1 -16
  287. transformers/models/ctrl/modeling_ctrl.py +44 -54
  288. transformers/models/ctrl/tokenization_ctrl.py +1 -0
  289. transformers/models/cvt/configuration_cvt.py +1 -0
  290. transformers/models/cvt/modeling_cvt.py +16 -20
  291. transformers/models/cwm/__init__.py +1 -0
  292. transformers/models/cwm/configuration_cwm.py +12 -8
  293. transformers/models/cwm/modeling_cwm.py +39 -37
  294. transformers/models/cwm/modular_cwm.py +12 -10
  295. transformers/models/d_fine/configuration_d_fine.py +5 -7
  296. transformers/models/d_fine/modeling_d_fine.py +128 -138
  297. transformers/models/d_fine/modular_d_fine.py +18 -33
  298. transformers/models/dab_detr/configuration_dab_detr.py +3 -6
  299. transformers/models/dab_detr/modeling_dab_detr.py +75 -81
  300. transformers/models/dac/configuration_dac.py +1 -0
  301. transformers/models/dac/feature_extraction_dac.py +9 -6
  302. transformers/models/dac/modeling_dac.py +26 -24
  303. transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
  304. transformers/models/data2vec/configuration_data2vec_text.py +3 -11
  305. transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
  306. transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
  307. transformers/models/data2vec/modeling_data2vec_text.py +93 -98
  308. transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
  309. transformers/models/data2vec/modular_data2vec_audio.py +1 -6
  310. transformers/models/data2vec/modular_data2vec_text.py +54 -58
  311. transformers/models/dbrx/configuration_dbrx.py +22 -36
  312. transformers/models/dbrx/modeling_dbrx.py +45 -42
  313. transformers/models/dbrx/modular_dbrx.py +33 -31
  314. transformers/models/deberta/configuration_deberta.py +1 -6
  315. transformers/models/deberta/modeling_deberta.py +60 -64
  316. transformers/models/deberta/tokenization_deberta.py +21 -9
  317. transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
  318. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
  319. transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
  320. transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
  321. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
  322. transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
  323. transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
  324. transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
  325. transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
  326. transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
  327. transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
  328. transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
  329. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
  330. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
  331. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
  332. transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
  333. transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
  334. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
  335. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  336. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
  337. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
  338. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
  339. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
  340. transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
  341. transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
  342. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
  343. transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
  344. transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
  345. transformers/models/deit/configuration_deit.py +1 -0
  346. transformers/models/deit/image_processing_deit.py +21 -18
  347. transformers/models/deit/image_processing_deit_fast.py +1 -0
  348. transformers/models/deit/modeling_deit.py +22 -24
  349. transformers/models/depth_anything/configuration_depth_anything.py +4 -2
  350. transformers/models/depth_anything/modeling_depth_anything.py +10 -10
  351. transformers/models/depth_pro/configuration_depth_pro.py +1 -0
  352. transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
  353. transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
  354. transformers/models/depth_pro/modeling_depth_pro.py +27 -31
  355. transformers/models/detr/configuration_detr.py +2 -1
  356. transformers/models/detr/image_processing_detr.py +66 -64
  357. transformers/models/detr/image_processing_detr_fast.py +34 -33
  358. transformers/models/detr/modeling_detr.py +79 -95
  359. transformers/models/dia/configuration_dia.py +15 -9
  360. transformers/models/dia/feature_extraction_dia.py +9 -6
  361. transformers/models/dia/generation_dia.py +50 -48
  362. transformers/models/dia/modeling_dia.py +69 -78
  363. transformers/models/dia/modular_dia.py +56 -64
  364. transformers/models/dia/processing_dia.py +29 -39
  365. transformers/models/dia/tokenization_dia.py +6 -3
  366. transformers/models/diffllama/configuration_diffllama.py +30 -25
  367. transformers/models/diffllama/modeling_diffllama.py +49 -46
  368. transformers/models/diffllama/modular_diffllama.py +19 -17
  369. transformers/models/dinat/configuration_dinat.py +1 -0
  370. transformers/models/dinat/modeling_dinat.py +44 -47
  371. transformers/models/dinov2/configuration_dinov2.py +1 -0
  372. transformers/models/dinov2/modeling_dinov2.py +15 -15
  373. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  374. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
  375. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
  376. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
  377. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
  378. transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
  379. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
  380. transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
  381. transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
  382. transformers/models/distilbert/configuration_distilbert.py +2 -8
  383. transformers/models/distilbert/modeling_distilbert.py +55 -55
  384. transformers/models/distilbert/tokenization_distilbert.py +1 -13
  385. transformers/models/doge/__init__.py +1 -0
  386. transformers/models/doge/configuration_doge.py +32 -39
  387. transformers/models/doge/modeling_doge.py +49 -45
  388. transformers/models/doge/modular_doge.py +63 -71
  389. transformers/models/donut/configuration_donut_swin.py +1 -0
  390. transformers/models/donut/image_processing_donut.py +29 -26
  391. transformers/models/donut/image_processing_donut_fast.py +15 -9
  392. transformers/models/donut/modeling_donut_swin.py +58 -62
  393. transformers/models/donut/processing_donut.py +26 -5
  394. transformers/models/dots1/configuration_dots1.py +33 -41
  395. transformers/models/dots1/modeling_dots1.py +45 -54
  396. transformers/models/dots1/modular_dots1.py +4 -5
  397. transformers/models/dpr/configuration_dpr.py +2 -19
  398. transformers/models/dpr/modeling_dpr.py +39 -42
  399. transformers/models/dpr/tokenization_dpr.py +9 -19
  400. transformers/models/dpr/tokenization_dpr_fast.py +9 -7
  401. transformers/models/dpt/configuration_dpt.py +2 -1
  402. transformers/models/dpt/image_processing_dpt.py +66 -65
  403. transformers/models/dpt/image_processing_dpt_fast.py +20 -18
  404. transformers/models/dpt/modeling_dpt.py +30 -32
  405. transformers/models/dpt/modular_dpt.py +17 -15
  406. transformers/models/edgetam/configuration_edgetam.py +3 -2
  407. transformers/models/edgetam/modeling_edgetam.py +86 -86
  408. transformers/models/edgetam/modular_edgetam.py +26 -21
  409. transformers/models/edgetam_video/__init__.py +1 -0
  410. transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
  411. transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
  412. transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
  413. transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
  414. transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
  415. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
  416. transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
  417. transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
  418. transformers/models/efficientnet/configuration_efficientnet.py +1 -0
  419. transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
  420. transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
  421. transformers/models/efficientnet/modeling_efficientnet.py +15 -19
  422. transformers/models/electra/configuration_electra.py +3 -13
  423. transformers/models/electra/modeling_electra.py +103 -108
  424. transformers/models/emu3/configuration_emu3.py +17 -13
  425. transformers/models/emu3/image_processing_emu3.py +39 -44
  426. transformers/models/emu3/modeling_emu3.py +108 -148
  427. transformers/models/emu3/modular_emu3.py +73 -115
  428. transformers/models/emu3/processing_emu3.py +43 -18
  429. transformers/models/encodec/configuration_encodec.py +4 -2
  430. transformers/models/encodec/feature_extraction_encodec.py +13 -10
  431. transformers/models/encodec/modeling_encodec.py +29 -39
  432. transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
  433. transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
  434. transformers/models/eomt/configuration_eomt.py +1 -0
  435. transformers/models/eomt/image_processing_eomt.py +56 -66
  436. transformers/models/eomt/image_processing_eomt_fast.py +33 -76
  437. transformers/models/eomt/modeling_eomt.py +18 -23
  438. transformers/models/eomt/modular_eomt.py +13 -18
  439. transformers/models/ernie/configuration_ernie.py +3 -24
  440. transformers/models/ernie/modeling_ernie.py +132 -127
  441. transformers/models/ernie/modular_ernie.py +103 -97
  442. transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
  443. transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
  444. transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
  445. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
  446. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
  447. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
  448. transformers/models/esm/configuration_esm.py +15 -11
  449. transformers/models/esm/modeling_esm.py +34 -38
  450. transformers/models/esm/modeling_esmfold.py +49 -53
  451. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  452. transformers/models/esm/openfold_utils/loss.py +2 -1
  453. transformers/models/esm/openfold_utils/protein.py +16 -15
  454. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  455. transformers/models/esm/tokenization_esm.py +4 -2
  456. transformers/models/evolla/configuration_evolla.py +40 -50
  457. transformers/models/evolla/modeling_evolla.py +66 -71
  458. transformers/models/evolla/modular_evolla.py +47 -53
  459. transformers/models/evolla/processing_evolla.py +35 -23
  460. transformers/models/exaone4/configuration_exaone4.py +25 -23
  461. transformers/models/exaone4/modeling_exaone4.py +38 -35
  462. transformers/models/exaone4/modular_exaone4.py +46 -44
  463. transformers/models/falcon/configuration_falcon.py +26 -31
  464. transformers/models/falcon/modeling_falcon.py +80 -82
  465. transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
  466. transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
  467. transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
  468. transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
  469. transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
  470. transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
  471. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
  472. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
  473. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
  474. transformers/models/flaubert/configuration_flaubert.py +5 -10
  475. transformers/models/flaubert/modeling_flaubert.py +143 -145
  476. transformers/models/flaubert/tokenization_flaubert.py +5 -3
  477. transformers/models/flava/configuration_flava.py +6 -5
  478. transformers/models/flava/image_processing_flava.py +67 -66
  479. transformers/models/flava/image_processing_flava_fast.py +49 -46
  480. transformers/models/flava/modeling_flava.py +136 -153
  481. transformers/models/flava/processing_flava.py +12 -2
  482. transformers/models/flex_olmo/__init__.py +1 -0
  483. transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
  484. transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
  485. transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
  486. transformers/models/florence2/configuration_florence2.py +1 -0
  487. transformers/models/florence2/modeling_florence2.py +69 -111
  488. transformers/models/florence2/modular_florence2.py +101 -104
  489. transformers/models/florence2/processing_florence2.py +47 -18
  490. transformers/models/fnet/configuration_fnet.py +2 -6
  491. transformers/models/fnet/modeling_fnet.py +80 -83
  492. transformers/models/fnet/tokenization_fnet.py +1 -0
  493. transformers/models/focalnet/configuration_focalnet.py +1 -0
  494. transformers/models/focalnet/modeling_focalnet.py +45 -51
  495. transformers/models/fsmt/configuration_fsmt.py +17 -12
  496. transformers/models/fsmt/modeling_fsmt.py +48 -49
  497. transformers/models/fsmt/tokenization_fsmt.py +5 -3
  498. transformers/models/funnel/configuration_funnel.py +1 -8
  499. transformers/models/funnel/modeling_funnel.py +93 -99
  500. transformers/models/funnel/tokenization_funnel.py +27 -17
  501. transformers/models/fuyu/configuration_fuyu.py +34 -28
  502. transformers/models/fuyu/image_processing_fuyu.py +31 -29
  503. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  504. transformers/models/fuyu/modeling_fuyu.py +53 -53
  505. transformers/models/fuyu/processing_fuyu.py +34 -23
  506. transformers/models/gemma/configuration_gemma.py +30 -25
  507. transformers/models/gemma/modeling_gemma.py +50 -46
  508. transformers/models/gemma/modular_gemma.py +47 -42
  509. transformers/models/gemma/tokenization_gemma.py +30 -10
  510. transformers/models/gemma2/configuration_gemma2.py +35 -30
  511. transformers/models/gemma2/modeling_gemma2.py +42 -39
  512. transformers/models/gemma2/modular_gemma2.py +66 -63
  513. transformers/models/gemma3/configuration_gemma3.py +44 -44
  514. transformers/models/gemma3/image_processing_gemma3.py +31 -29
  515. transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
  516. transformers/models/gemma3/modeling_gemma3.py +207 -159
  517. transformers/models/gemma3/modular_gemma3.py +204 -153
  518. transformers/models/gemma3/processing_gemma3.py +5 -5
  519. transformers/models/gemma3n/configuration_gemma3n.py +26 -36
  520. transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
  521. transformers/models/gemma3n/modeling_gemma3n.py +356 -222
  522. transformers/models/gemma3n/modular_gemma3n.py +207 -230
  523. transformers/models/gemma3n/processing_gemma3n.py +26 -12
  524. transformers/models/git/configuration_git.py +8 -5
  525. transformers/models/git/modeling_git.py +204 -266
  526. transformers/models/git/processing_git.py +14 -2
  527. transformers/models/glm/configuration_glm.py +28 -24
  528. transformers/models/glm/modeling_glm.py +40 -37
  529. transformers/models/glm/modular_glm.py +7 -4
  530. transformers/models/glm4/configuration_glm4.py +28 -24
  531. transformers/models/glm4/modeling_glm4.py +42 -40
  532. transformers/models/glm4/modular_glm4.py +10 -8
  533. transformers/models/glm46v/configuration_glm46v.py +1 -0
  534. transformers/models/glm46v/image_processing_glm46v.py +40 -35
  535. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  536. transformers/models/glm46v/modeling_glm46v.py +90 -137
  537. transformers/models/glm46v/modular_glm46v.py +3 -4
  538. transformers/models/glm46v/processing_glm46v.py +41 -7
  539. transformers/models/glm46v/video_processing_glm46v.py +11 -9
  540. transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
  541. transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
  542. transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
  543. transformers/models/glm4v/configuration_glm4v.py +20 -18
  544. transformers/models/glm4v/image_processing_glm4v.py +40 -34
  545. transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
  546. transformers/models/glm4v/modeling_glm4v.py +205 -254
  547. transformers/models/glm4v/modular_glm4v.py +224 -210
  548. transformers/models/glm4v/processing_glm4v.py +41 -7
  549. transformers/models/glm4v/video_processing_glm4v.py +11 -9
  550. transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
  551. transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
  552. transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
  553. transformers/models/glpn/configuration_glpn.py +1 -0
  554. transformers/models/glpn/image_processing_glpn.py +12 -11
  555. transformers/models/glpn/image_processing_glpn_fast.py +13 -11
  556. transformers/models/glpn/modeling_glpn.py +14 -16
  557. transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
  558. transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
  559. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
  560. transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
  561. transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
  562. transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
  563. transformers/models/gpt2/configuration_gpt2.py +2 -13
  564. transformers/models/gpt2/modeling_gpt2.py +115 -120
  565. transformers/models/gpt2/tokenization_gpt2.py +46 -15
  566. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
  567. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
  568. transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
  569. transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
  570. transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
  571. transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
  572. transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
  573. transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
  574. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
  575. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
  576. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
  577. transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
  578. transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
  579. transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
  580. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  581. transformers/models/gptj/configuration_gptj.py +4 -4
  582. transformers/models/gptj/modeling_gptj.py +87 -101
  583. transformers/models/granite/configuration_granite.py +33 -28
  584. transformers/models/granite/modeling_granite.py +46 -44
  585. transformers/models/granite/modular_granite.py +31 -29
  586. transformers/models/granite_speech/configuration_granite_speech.py +1 -0
  587. transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
  588. transformers/models/granite_speech/modeling_granite_speech.py +52 -82
  589. transformers/models/granite_speech/processing_granite_speech.py +4 -11
  590. transformers/models/granitemoe/configuration_granitemoe.py +36 -31
  591. transformers/models/granitemoe/modeling_granitemoe.py +46 -41
  592. transformers/models/granitemoe/modular_granitemoe.py +27 -22
  593. transformers/models/granitemoehybrid/__init__.py +1 -0
  594. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
  595. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
  596. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
  597. transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
  598. transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
  599. transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
  600. transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
  601. transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
  602. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
  603. transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
  604. transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
  605. transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
  606. transformers/models/groupvit/configuration_groupvit.py +2 -4
  607. transformers/models/groupvit/modeling_groupvit.py +93 -107
  608. transformers/models/helium/configuration_helium.py +29 -25
  609. transformers/models/helium/modeling_helium.py +40 -38
  610. transformers/models/helium/modular_helium.py +7 -3
  611. transformers/models/herbert/tokenization_herbert.py +28 -10
  612. transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
  613. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
  614. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
  615. transformers/models/hiera/configuration_hiera.py +1 -0
  616. transformers/models/hiera/modeling_hiera.py +66 -72
  617. transformers/models/hubert/configuration_hubert.py +2 -4
  618. transformers/models/hubert/modeling_hubert.py +37 -42
  619. transformers/models/hubert/modular_hubert.py +11 -13
  620. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
  621. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
  622. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
  623. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  624. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
  625. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
  626. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  627. transformers/models/ibert/configuration_ibert.py +2 -4
  628. transformers/models/ibert/modeling_ibert.py +62 -82
  629. transformers/models/ibert/quant_modules.py +1 -0
  630. transformers/models/idefics/configuration_idefics.py +8 -5
  631. transformers/models/idefics/image_processing_idefics.py +15 -13
  632. transformers/models/idefics/modeling_idefics.py +82 -75
  633. transformers/models/idefics/perceiver.py +3 -1
  634. transformers/models/idefics/processing_idefics.py +48 -32
  635. transformers/models/idefics/vision.py +25 -24
  636. transformers/models/idefics2/configuration_idefics2.py +3 -1
  637. transformers/models/idefics2/image_processing_idefics2.py +32 -31
  638. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  639. transformers/models/idefics2/modeling_idefics2.py +101 -127
  640. transformers/models/idefics2/processing_idefics2.py +68 -10
  641. transformers/models/idefics3/configuration_idefics3.py +4 -1
  642. transformers/models/idefics3/image_processing_idefics3.py +43 -42
  643. transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
  644. transformers/models/idefics3/modeling_idefics3.py +90 -115
  645. transformers/models/idefics3/processing_idefics3.py +69 -15
  646. transformers/models/ijepa/configuration_ijepa.py +1 -0
  647. transformers/models/ijepa/modeling_ijepa.py +11 -10
  648. transformers/models/ijepa/modular_ijepa.py +7 -5
  649. transformers/models/imagegpt/configuration_imagegpt.py +2 -9
  650. transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
  651. transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
  652. transformers/models/imagegpt/modeling_imagegpt.py +65 -76
  653. transformers/models/informer/configuration_informer.py +9 -6
  654. transformers/models/informer/modeling_informer.py +86 -88
  655. transformers/models/informer/modular_informer.py +16 -14
  656. transformers/models/instructblip/configuration_instructblip.py +2 -2
  657. transformers/models/instructblip/modeling_instructblip.py +63 -103
  658. transformers/models/instructblip/processing_instructblip.py +36 -10
  659. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  660. transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
  661. transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
  662. transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
  663. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
  664. transformers/models/internvl/configuration_internvl.py +1 -0
  665. transformers/models/internvl/modeling_internvl.py +106 -85
  666. transformers/models/internvl/modular_internvl.py +67 -47
  667. transformers/models/internvl/processing_internvl.py +45 -12
  668. transformers/models/internvl/video_processing_internvl.py +12 -10
  669. transformers/models/jamba/configuration_jamba.py +8 -5
  670. transformers/models/jamba/modeling_jamba.py +66 -68
  671. transformers/models/jamba/modular_jamba.py +55 -54
  672. transformers/models/janus/configuration_janus.py +1 -0
  673. transformers/models/janus/image_processing_janus.py +37 -35
  674. transformers/models/janus/image_processing_janus_fast.py +20 -18
  675. transformers/models/janus/modeling_janus.py +191 -115
  676. transformers/models/janus/modular_janus.py +84 -133
  677. transformers/models/janus/processing_janus.py +43 -17
  678. transformers/models/jetmoe/configuration_jetmoe.py +26 -24
  679. transformers/models/jetmoe/modeling_jetmoe.py +46 -43
  680. transformers/models/jetmoe/modular_jetmoe.py +33 -31
  681. transformers/models/kosmos2/configuration_kosmos2.py +9 -10
  682. transformers/models/kosmos2/modeling_kosmos2.py +173 -208
  683. transformers/models/kosmos2/processing_kosmos2.py +55 -40
  684. transformers/models/kosmos2_5/__init__.py +1 -0
  685. transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
  686. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
  687. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
  688. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
  689. transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
  690. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
  691. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
  692. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
  693. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
  694. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
  695. transformers/models/layoutlm/configuration_layoutlm.py +2 -14
  696. transformers/models/layoutlm/modeling_layoutlm.py +72 -77
  697. transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
  698. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
  699. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
  700. transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
  701. transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
  702. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
  703. transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
  704. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
  705. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
  706. transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
  707. transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
  708. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
  709. transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
  710. transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
  711. transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
  712. transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
  713. transformers/models/led/configuration_led.py +12 -8
  714. transformers/models/led/modeling_led.py +266 -124
  715. transformers/models/levit/configuration_levit.py +1 -0
  716. transformers/models/levit/image_processing_levit.py +21 -19
  717. transformers/models/levit/image_processing_levit_fast.py +5 -4
  718. transformers/models/levit/modeling_levit.py +19 -38
  719. transformers/models/lfm2/configuration_lfm2.py +30 -27
  720. transformers/models/lfm2/modeling_lfm2.py +50 -47
  721. transformers/models/lfm2/modular_lfm2.py +30 -29
  722. transformers/models/lfm2_moe/__init__.py +1 -0
  723. transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
  724. transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
  725. transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
  726. transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
  727. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
  728. transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
  729. transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
  730. transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
  731. transformers/models/lightglue/image_processing_lightglue.py +15 -16
  732. transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
  733. transformers/models/lightglue/modeling_lightglue.py +31 -31
  734. transformers/models/lightglue/modular_lightglue.py +28 -29
  735. transformers/models/lilt/configuration_lilt.py +2 -6
  736. transformers/models/lilt/modeling_lilt.py +70 -76
  737. transformers/models/llama/configuration_llama.py +31 -26
  738. transformers/models/llama/modeling_llama.py +39 -36
  739. transformers/models/llama/tokenization_llama.py +44 -14
  740. transformers/models/llama4/configuration_llama4.py +30 -27
  741. transformers/models/llama4/image_processing_llama4_fast.py +14 -12
  742. transformers/models/llama4/modeling_llama4.py +113 -120
  743. transformers/models/llama4/processing_llama4.py +57 -33
  744. transformers/models/llava/configuration_llava.py +1 -10
  745. transformers/models/llava/image_processing_llava.py +28 -25
  746. transformers/models/llava/image_processing_llava_fast.py +11 -9
  747. transformers/models/llava/modeling_llava.py +109 -85
  748. transformers/models/llava/processing_llava.py +51 -18
  749. transformers/models/llava_next/configuration_llava_next.py +2 -2
  750. transformers/models/llava_next/image_processing_llava_next.py +45 -43
  751. transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
  752. transformers/models/llava_next/modeling_llava_next.py +107 -110
  753. transformers/models/llava_next/processing_llava_next.py +47 -18
  754. transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
  755. transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
  756. transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
  757. transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
  758. transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
  759. transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
  760. transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
  761. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
  762. transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
  763. transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
  764. transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
  765. transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
  766. transformers/models/longcat_flash/__init__.py +1 -0
  767. transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
  768. transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
  769. transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
  770. transformers/models/longformer/configuration_longformer.py +5 -5
  771. transformers/models/longformer/modeling_longformer.py +101 -105
  772. transformers/models/longt5/configuration_longt5.py +7 -9
  773. transformers/models/longt5/modeling_longt5.py +49 -49
  774. transformers/models/luke/configuration_luke.py +2 -8
  775. transformers/models/luke/modeling_luke.py +181 -188
  776. transformers/models/luke/tokenization_luke.py +140 -107
  777. transformers/models/lxmert/configuration_lxmert.py +1 -16
  778. transformers/models/lxmert/modeling_lxmert.py +74 -65
  779. transformers/models/m2m_100/configuration_m2m_100.py +9 -7
  780. transformers/models/m2m_100/modeling_m2m_100.py +71 -83
  781. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  782. transformers/models/mamba/configuration_mamba.py +2 -1
  783. transformers/models/mamba/modeling_mamba.py +66 -58
  784. transformers/models/mamba2/configuration_mamba2.py +8 -5
  785. transformers/models/mamba2/modeling_mamba2.py +69 -68
  786. transformers/models/marian/configuration_marian.py +5 -10
  787. transformers/models/marian/modeling_marian.py +87 -93
  788. transformers/models/marian/tokenization_marian.py +6 -6
  789. transformers/models/markuplm/configuration_markuplm.py +7 -4
  790. transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
  791. transformers/models/markuplm/modeling_markuplm.py +70 -69
  792. transformers/models/markuplm/processing_markuplm.py +38 -31
  793. transformers/models/markuplm/tokenization_markuplm.py +136 -93
  794. transformers/models/mask2former/configuration_mask2former.py +8 -5
  795. transformers/models/mask2former/image_processing_mask2former.py +85 -84
  796. transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
  797. transformers/models/mask2former/modeling_mask2former.py +103 -118
  798. transformers/models/mask2former/modular_mask2former.py +8 -6
  799. transformers/models/maskformer/configuration_maskformer.py +9 -6
  800. transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
  801. transformers/models/maskformer/image_processing_maskformer.py +85 -84
  802. transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
  803. transformers/models/maskformer/modeling_maskformer.py +65 -79
  804. transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
  805. transformers/models/mbart/configuration_mbart.py +4 -9
  806. transformers/models/mbart/modeling_mbart.py +116 -131
  807. transformers/models/mbart/tokenization_mbart.py +54 -11
  808. transformers/models/mbart50/tokenization_mbart50.py +13 -8
  809. transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
  810. transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
  811. transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
  812. transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
  813. transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
  814. transformers/models/mgp_str/configuration_mgp_str.py +1 -0
  815. transformers/models/mgp_str/modeling_mgp_str.py +18 -20
  816. transformers/models/mgp_str/processing_mgp_str.py +20 -3
  817. transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
  818. transformers/models/mimi/configuration_mimi.py +40 -42
  819. transformers/models/mimi/modeling_mimi.py +113 -142
  820. transformers/models/minimax/__init__.py +1 -0
  821. transformers/models/minimax/configuration_minimax.py +43 -37
  822. transformers/models/minimax/modeling_minimax.py +51 -61
  823. transformers/models/minimax/modular_minimax.py +62 -68
  824. transformers/models/ministral/configuration_ministral.py +29 -25
  825. transformers/models/ministral/modeling_ministral.py +38 -36
  826. transformers/models/ministral/modular_ministral.py +37 -32
  827. transformers/models/ministral3/configuration_ministral3.py +27 -24
  828. transformers/models/ministral3/modeling_ministral3.py +37 -36
  829. transformers/models/ministral3/modular_ministral3.py +5 -4
  830. transformers/models/mistral/configuration_mistral.py +29 -24
  831. transformers/models/mistral/modeling_mistral.py +37 -36
  832. transformers/models/mistral/modular_mistral.py +12 -11
  833. transformers/models/mistral3/configuration_mistral3.py +1 -4
  834. transformers/models/mistral3/modeling_mistral3.py +86 -89
  835. transformers/models/mistral3/modular_mistral3.py +68 -69
  836. transformers/models/mixtral/configuration_mixtral.py +34 -29
  837. transformers/models/mixtral/modeling_mixtral.py +45 -50
  838. transformers/models/mixtral/modular_mixtral.py +31 -32
  839. transformers/models/mlcd/configuration_mlcd.py +1 -0
  840. transformers/models/mlcd/modeling_mlcd.py +14 -20
  841. transformers/models/mlcd/modular_mlcd.py +13 -17
  842. transformers/models/mllama/configuration_mllama.py +15 -10
  843. transformers/models/mllama/image_processing_mllama.py +25 -23
  844. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  845. transformers/models/mllama/modeling_mllama.py +94 -105
  846. transformers/models/mllama/processing_mllama.py +55 -6
  847. transformers/models/mluke/tokenization_mluke.py +107 -101
  848. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
  849. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
  850. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
  851. transformers/models/mobilebert/configuration_mobilebert.py +2 -4
  852. transformers/models/mobilebert/modeling_mobilebert.py +85 -77
  853. transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
  854. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
  855. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
  856. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
  857. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
  858. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
  859. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
  860. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
  861. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
  862. transformers/models/mobilevit/configuration_mobilevit.py +1 -0
  863. transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
  864. transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
  865. transformers/models/mobilevit/modeling_mobilevit.py +21 -28
  866. transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
  867. transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
  868. transformers/models/modernbert/configuration_modernbert.py +42 -44
  869. transformers/models/modernbert/modeling_modernbert.py +133 -145
  870. transformers/models/modernbert/modular_modernbert.py +170 -186
  871. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
  872. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
  873. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
  874. transformers/models/moonshine/configuration_moonshine.py +31 -34
  875. transformers/models/moonshine/modeling_moonshine.py +71 -71
  876. transformers/models/moonshine/modular_moonshine.py +83 -88
  877. transformers/models/moshi/configuration_moshi.py +23 -46
  878. transformers/models/moshi/modeling_moshi.py +187 -157
  879. transformers/models/mpnet/configuration_mpnet.py +2 -6
  880. transformers/models/mpnet/modeling_mpnet.py +57 -62
  881. transformers/models/mpnet/tokenization_mpnet.py +15 -4
  882. transformers/models/mpt/configuration_mpt.py +9 -5
  883. transformers/models/mpt/modeling_mpt.py +60 -60
  884. transformers/models/mra/configuration_mra.py +2 -8
  885. transformers/models/mra/modeling_mra.py +57 -64
  886. transformers/models/mt5/configuration_mt5.py +8 -10
  887. transformers/models/mt5/modeling_mt5.py +95 -87
  888. transformers/models/musicgen/configuration_musicgen.py +8 -12
  889. transformers/models/musicgen/modeling_musicgen.py +122 -118
  890. transformers/models/musicgen/processing_musicgen.py +21 -3
  891. transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
  892. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
  893. transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
  894. transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
  895. transformers/models/mvp/configuration_mvp.py +5 -8
  896. transformers/models/mvp/modeling_mvp.py +123 -135
  897. transformers/models/myt5/tokenization_myt5.py +10 -8
  898. transformers/models/nanochat/configuration_nanochat.py +8 -5
  899. transformers/models/nanochat/modeling_nanochat.py +40 -37
  900. transformers/models/nanochat/modular_nanochat.py +14 -12
  901. transformers/models/nemotron/configuration_nemotron.py +30 -25
  902. transformers/models/nemotron/modeling_nemotron.py +57 -56
  903. transformers/models/nllb/tokenization_nllb.py +28 -12
  904. transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
  905. transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
  906. transformers/models/nougat/image_processing_nougat.py +32 -29
  907. transformers/models/nougat/image_processing_nougat_fast.py +14 -12
  908. transformers/models/nougat/processing_nougat.py +39 -37
  909. transformers/models/nougat/tokenization_nougat.py +73 -18
  910. transformers/models/nystromformer/configuration_nystromformer.py +2 -8
  911. transformers/models/nystromformer/modeling_nystromformer.py +63 -74
  912. transformers/models/olmo/configuration_olmo.py +28 -23
  913. transformers/models/olmo/modeling_olmo.py +39 -36
  914. transformers/models/olmo/modular_olmo.py +11 -7
  915. transformers/models/olmo2/configuration_olmo2.py +28 -23
  916. transformers/models/olmo2/modeling_olmo2.py +41 -37
  917. transformers/models/olmo2/modular_olmo2.py +32 -29
  918. transformers/models/olmo3/__init__.py +1 -0
  919. transformers/models/olmo3/configuration_olmo3.py +30 -26
  920. transformers/models/olmo3/modeling_olmo3.py +39 -36
  921. transformers/models/olmo3/modular_olmo3.py +40 -37
  922. transformers/models/olmoe/configuration_olmoe.py +33 -29
  923. transformers/models/olmoe/modeling_olmoe.py +46 -52
  924. transformers/models/olmoe/modular_olmoe.py +15 -16
  925. transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
  926. transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
  927. transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
  928. transformers/models/oneformer/configuration_oneformer.py +8 -5
  929. transformers/models/oneformer/image_processing_oneformer.py +84 -83
  930. transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
  931. transformers/models/oneformer/modeling_oneformer.py +171 -147
  932. transformers/models/oneformer/processing_oneformer.py +43 -28
  933. transformers/models/openai/configuration_openai.py +1 -16
  934. transformers/models/openai/modeling_openai.py +51 -65
  935. transformers/models/openai/tokenization_openai.py +47 -8
  936. transformers/models/opt/configuration_opt.py +7 -6
  937. transformers/models/opt/modeling_opt.py +76 -78
  938. transformers/models/ovis2/__init__.py +1 -0
  939. transformers/models/ovis2/configuration_ovis2.py +1 -0
  940. transformers/models/ovis2/image_processing_ovis2.py +24 -22
  941. transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
  942. transformers/models/ovis2/modeling_ovis2.py +142 -111
  943. transformers/models/ovis2/modular_ovis2.py +45 -90
  944. transformers/models/ovis2/processing_ovis2.py +40 -12
  945. transformers/models/owlv2/configuration_owlv2.py +2 -4
  946. transformers/models/owlv2/image_processing_owlv2.py +21 -20
  947. transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
  948. transformers/models/owlv2/modeling_owlv2.py +117 -133
  949. transformers/models/owlv2/modular_owlv2.py +14 -11
  950. transformers/models/owlv2/processing_owlv2.py +49 -20
  951. transformers/models/owlvit/configuration_owlvit.py +2 -4
  952. transformers/models/owlvit/image_processing_owlvit.py +22 -21
  953. transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
  954. transformers/models/owlvit/modeling_owlvit.py +116 -132
  955. transformers/models/owlvit/processing_owlvit.py +48 -20
  956. transformers/models/paligemma/configuration_paligemma.py +1 -4
  957. transformers/models/paligemma/modeling_paligemma.py +93 -103
  958. transformers/models/paligemma/processing_paligemma.py +66 -13
  959. transformers/models/parakeet/configuration_parakeet.py +14 -7
  960. transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
  961. transformers/models/parakeet/modeling_parakeet.py +28 -32
  962. transformers/models/parakeet/modular_parakeet.py +20 -23
  963. transformers/models/parakeet/processing_parakeet.py +5 -13
  964. transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
  965. transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
  966. transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
  967. transformers/models/patchtst/configuration_patchtst.py +9 -6
  968. transformers/models/patchtst/modeling_patchtst.py +80 -97
  969. transformers/models/pegasus/configuration_pegasus.py +5 -8
  970. transformers/models/pegasus/modeling_pegasus.py +66 -72
  971. transformers/models/pegasus/tokenization_pegasus.py +45 -15
  972. transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
  973. transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
  974. transformers/models/perceiver/configuration_perceiver.py +1 -0
  975. transformers/models/perceiver/image_processing_perceiver.py +25 -22
  976. transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
  977. transformers/models/perceiver/modeling_perceiver.py +146 -165
  978. transformers/models/perceiver/tokenization_perceiver.py +6 -3
  979. transformers/models/perception_lm/configuration_perception_lm.py +1 -0
  980. transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
  981. transformers/models/perception_lm/modeling_perception_lm.py +70 -71
  982. transformers/models/perception_lm/modular_perception_lm.py +61 -65
  983. transformers/models/perception_lm/processing_perception_lm.py +47 -13
  984. transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
  985. transformers/models/persimmon/configuration_persimmon.py +28 -23
  986. transformers/models/persimmon/modeling_persimmon.py +45 -43
  987. transformers/models/phi/configuration_phi.py +28 -23
  988. transformers/models/phi/modeling_phi.py +43 -40
  989. transformers/models/phi/modular_phi.py +24 -23
  990. transformers/models/phi3/configuration_phi3.py +33 -28
  991. transformers/models/phi3/modeling_phi3.py +38 -36
  992. transformers/models/phi3/modular_phi3.py +17 -13
  993. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
  994. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
  995. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  996. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
  997. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
  998. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
  999. transformers/models/phimoe/configuration_phimoe.py +36 -31
  1000. transformers/models/phimoe/modeling_phimoe.py +45 -50
  1001. transformers/models/phimoe/modular_phimoe.py +4 -3
  1002. transformers/models/phobert/tokenization_phobert.py +6 -4
  1003. transformers/models/pix2struct/configuration_pix2struct.py +10 -12
  1004. transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
  1005. transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
  1006. transformers/models/pix2struct/modeling_pix2struct.py +52 -58
  1007. transformers/models/pix2struct/processing_pix2struct.py +30 -5
  1008. transformers/models/pixtral/configuration_pixtral.py +14 -11
  1009. transformers/models/pixtral/image_processing_pixtral.py +28 -26
  1010. transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
  1011. transformers/models/pixtral/modeling_pixtral.py +34 -28
  1012. transformers/models/pixtral/processing_pixtral.py +53 -21
  1013. transformers/models/plbart/configuration_plbart.py +5 -8
  1014. transformers/models/plbart/modeling_plbart.py +106 -119
  1015. transformers/models/plbart/modular_plbart.py +33 -39
  1016. transformers/models/plbart/tokenization_plbart.py +7 -4
  1017. transformers/models/poolformer/configuration_poolformer.py +1 -0
  1018. transformers/models/poolformer/image_processing_poolformer.py +24 -21
  1019. transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
  1020. transformers/models/poolformer/modeling_poolformer.py +13 -23
  1021. transformers/models/pop2piano/configuration_pop2piano.py +8 -7
  1022. transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
  1023. transformers/models/pop2piano/modeling_pop2piano.py +24 -26
  1024. transformers/models/pop2piano/processing_pop2piano.py +33 -25
  1025. transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
  1026. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1027. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1028. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
  1029. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
  1030. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
  1031. transformers/models/prophetnet/configuration_prophetnet.py +38 -37
  1032. transformers/models/prophetnet/modeling_prophetnet.py +131 -114
  1033. transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
  1034. transformers/models/pvt/configuration_pvt.py +1 -0
  1035. transformers/models/pvt/image_processing_pvt.py +27 -24
  1036. transformers/models/pvt/image_processing_pvt_fast.py +2 -1
  1037. transformers/models/pvt/modeling_pvt.py +21 -21
  1038. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
  1039. transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
  1040. transformers/models/qwen2/configuration_qwen2.py +25 -32
  1041. transformers/models/qwen2/modeling_qwen2.py +38 -36
  1042. transformers/models/qwen2/modular_qwen2.py +12 -11
  1043. transformers/models/qwen2/tokenization_qwen2.py +23 -12
  1044. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
  1045. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
  1046. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
  1047. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
  1048. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
  1049. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
  1050. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
  1051. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
  1052. transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
  1053. transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
  1054. transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
  1055. transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
  1056. transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
  1057. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
  1058. transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
  1059. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
  1060. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
  1061. transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
  1062. transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
  1063. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
  1064. transformers/models/qwen3/configuration_qwen3.py +27 -34
  1065. transformers/models/qwen3/modeling_qwen3.py +39 -36
  1066. transformers/models/qwen3/modular_qwen3.py +6 -4
  1067. transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
  1068. transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
  1069. transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
  1070. transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
  1071. transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
  1072. transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
  1073. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
  1074. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
  1075. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
  1076. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
  1077. transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
  1078. transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
  1079. transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
  1080. transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
  1081. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
  1082. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
  1083. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
  1084. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
  1085. transformers/models/rag/configuration_rag.py +15 -6
  1086. transformers/models/rag/modeling_rag.py +130 -127
  1087. transformers/models/rag/retrieval_rag.py +5 -3
  1088. transformers/models/rag/tokenization_rag.py +50 -0
  1089. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
  1090. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
  1091. transformers/models/reformer/configuration_reformer.py +8 -7
  1092. transformers/models/reformer/modeling_reformer.py +69 -80
  1093. transformers/models/reformer/tokenization_reformer.py +31 -11
  1094. transformers/models/regnet/configuration_regnet.py +1 -0
  1095. transformers/models/regnet/modeling_regnet.py +8 -15
  1096. transformers/models/rembert/configuration_rembert.py +2 -8
  1097. transformers/models/rembert/modeling_rembert.py +111 -121
  1098. transformers/models/rembert/tokenization_rembert.py +12 -2
  1099. transformers/models/resnet/configuration_resnet.py +1 -0
  1100. transformers/models/resnet/modeling_resnet.py +13 -27
  1101. transformers/models/roberta/configuration_roberta.py +3 -11
  1102. transformers/models/roberta/modeling_roberta.py +93 -94
  1103. transformers/models/roberta/modular_roberta.py +58 -58
  1104. transformers/models/roberta/tokenization_roberta.py +29 -17
  1105. transformers/models/roberta/tokenization_roberta_old.py +4 -2
  1106. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
  1107. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
  1108. transformers/models/roc_bert/configuration_roc_bert.py +2 -8
  1109. transformers/models/roc_bert/modeling_roc_bert.py +121 -122
  1110. transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
  1111. transformers/models/roformer/configuration_roformer.py +3 -13
  1112. transformers/models/roformer/modeling_roformer.py +81 -85
  1113. transformers/models/roformer/tokenization_roformer.py +412 -74
  1114. transformers/models/roformer/tokenization_roformer_fast.py +160 -0
  1115. transformers/models/roformer/tokenization_utils.py +1 -0
  1116. transformers/models/rt_detr/configuration_rt_detr.py +2 -1
  1117. transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
  1118. transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
  1119. transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
  1120. transformers/models/rt_detr/modeling_rt_detr.py +90 -99
  1121. transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
  1122. transformers/models/rt_detr/modular_rt_detr.py +16 -16
  1123. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
  1124. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
  1125. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
  1126. transformers/models/rwkv/configuration_rwkv.py +4 -2
  1127. transformers/models/rwkv/modeling_rwkv.py +32 -31
  1128. transformers/models/sam/configuration_sam.py +1 -3
  1129. transformers/models/sam/image_processing_sam.py +60 -59
  1130. transformers/models/sam/image_processing_sam_fast.py +27 -25
  1131. transformers/models/sam/modeling_sam.py +41 -47
  1132. transformers/models/sam/processing_sam.py +27 -39
  1133. transformers/models/sam2/configuration_sam2.py +3 -2
  1134. transformers/models/sam2/image_processing_sam2_fast.py +15 -14
  1135. transformers/models/sam2/modeling_sam2.py +90 -96
  1136. transformers/models/sam2/modular_sam2.py +91 -86
  1137. transformers/models/sam2/processing_sam2.py +47 -31
  1138. transformers/models/sam2_video/configuration_sam2_video.py +1 -0
  1139. transformers/models/sam2_video/modeling_sam2_video.py +144 -151
  1140. transformers/models/sam2_video/modular_sam2_video.py +104 -101
  1141. transformers/models/sam2_video/processing_sam2_video.py +66 -49
  1142. transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
  1143. transformers/models/sam3/configuration_sam3.py +2 -21
  1144. transformers/models/sam3/image_processing_sam3_fast.py +20 -17
  1145. transformers/models/sam3/modeling_sam3.py +170 -184
  1146. transformers/models/sam3/modular_sam3.py +8 -3
  1147. transformers/models/sam3/processing_sam3.py +52 -37
  1148. transformers/models/sam3_tracker/__init__.py +1 -0
  1149. transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
  1150. transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
  1151. transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
  1152. transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
  1153. transformers/models/sam3_tracker_video/__init__.py +1 -0
  1154. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
  1155. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
  1156. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
  1157. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
  1158. transformers/models/sam3_video/configuration_sam3_video.py +1 -14
  1159. transformers/models/sam3_video/modeling_sam3_video.py +34 -33
  1160. transformers/models/sam3_video/processing_sam3_video.py +46 -26
  1161. transformers/models/sam_hq/__init__.py +1 -1
  1162. transformers/models/sam_hq/configuration_sam_hq.py +1 -3
  1163. transformers/models/sam_hq/modeling_sam_hq.py +69 -74
  1164. transformers/models/sam_hq/modular_sam_hq.py +25 -23
  1165. transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
  1166. transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
  1167. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
  1168. transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
  1169. transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
  1170. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
  1171. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
  1172. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
  1173. transformers/models/seed_oss/configuration_seed_oss.py +32 -28
  1174. transformers/models/seed_oss/modeling_seed_oss.py +35 -33
  1175. transformers/models/seed_oss/modular_seed_oss.py +4 -3
  1176. transformers/models/segformer/configuration_segformer.py +10 -0
  1177. transformers/models/segformer/image_processing_segformer.py +42 -39
  1178. transformers/models/segformer/image_processing_segformer_fast.py +12 -10
  1179. transformers/models/segformer/modeling_segformer.py +31 -34
  1180. transformers/models/segformer/modular_segformer.py +10 -8
  1181. transformers/models/seggpt/configuration_seggpt.py +1 -0
  1182. transformers/models/seggpt/image_processing_seggpt.py +41 -38
  1183. transformers/models/seggpt/modeling_seggpt.py +38 -50
  1184. transformers/models/sew/configuration_sew.py +2 -4
  1185. transformers/models/sew/modeling_sew.py +36 -38
  1186. transformers/models/sew/modular_sew.py +13 -13
  1187. transformers/models/sew_d/configuration_sew_d.py +2 -4
  1188. transformers/models/sew_d/modeling_sew_d.py +30 -31
  1189. transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
  1190. transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
  1191. transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
  1192. transformers/models/siglip/configuration_siglip.py +2 -4
  1193. transformers/models/siglip/image_processing_siglip.py +20 -17
  1194. transformers/models/siglip/image_processing_siglip_fast.py +1 -0
  1195. transformers/models/siglip/modeling_siglip.py +75 -84
  1196. transformers/models/siglip/processing_siglip.py +14 -2
  1197. transformers/models/siglip/tokenization_siglip.py +7 -6
  1198. transformers/models/siglip2/configuration_siglip2.py +2 -5
  1199. transformers/models/siglip2/image_processing_siglip2.py +16 -15
  1200. transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
  1201. transformers/models/siglip2/modeling_siglip2.py +129 -143
  1202. transformers/models/siglip2/modular_siglip2.py +46 -47
  1203. transformers/models/siglip2/processing_siglip2.py +14 -2
  1204. transformers/models/smollm3/configuration_smollm3.py +32 -29
  1205. transformers/models/smollm3/modeling_smollm3.py +39 -36
  1206. transformers/models/smollm3/modular_smollm3.py +35 -33
  1207. transformers/models/smolvlm/configuration_smolvlm.py +4 -2
  1208. transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
  1209. transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
  1210. transformers/models/smolvlm/modeling_smolvlm.py +94 -126
  1211. transformers/models/smolvlm/modular_smolvlm.py +39 -50
  1212. transformers/models/smolvlm/processing_smolvlm.py +83 -15
  1213. transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
  1214. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
  1215. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
  1216. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1217. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
  1218. transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
  1219. transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
  1220. transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
  1221. transformers/models/speecht5/configuration_speecht5.py +9 -7
  1222. transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
  1223. transformers/models/speecht5/modeling_speecht5.py +175 -213
  1224. transformers/models/speecht5/number_normalizer.py +1 -0
  1225. transformers/models/speecht5/processing_speecht5.py +37 -3
  1226. transformers/models/speecht5/tokenization_speecht5.py +5 -4
  1227. transformers/models/splinter/configuration_splinter.py +7 -6
  1228. transformers/models/splinter/modeling_splinter.py +59 -71
  1229. transformers/models/splinter/tokenization_splinter.py +30 -9
  1230. transformers/models/squeezebert/configuration_squeezebert.py +2 -14
  1231. transformers/models/squeezebert/modeling_squeezebert.py +62 -68
  1232. transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
  1233. transformers/models/stablelm/configuration_stablelm.py +29 -24
  1234. transformers/models/stablelm/modeling_stablelm.py +45 -44
  1235. transformers/models/starcoder2/configuration_starcoder2.py +27 -30
  1236. transformers/models/starcoder2/modeling_starcoder2.py +41 -39
  1237. transformers/models/starcoder2/modular_starcoder2.py +16 -14
  1238. transformers/models/superglue/configuration_superglue.py +3 -7
  1239. transformers/models/superglue/image_processing_superglue.py +15 -15
  1240. transformers/models/superglue/image_processing_superglue_fast.py +10 -9
  1241. transformers/models/superglue/modeling_superglue.py +37 -42
  1242. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1243. transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
  1244. transformers/models/superpoint/modeling_superpoint.py +16 -18
  1245. transformers/models/swiftformer/configuration_swiftformer.py +1 -0
  1246. transformers/models/swiftformer/modeling_swiftformer.py +14 -18
  1247. transformers/models/swin/configuration_swin.py +1 -0
  1248. transformers/models/swin/modeling_swin.py +86 -86
  1249. transformers/models/swin2sr/configuration_swin2sr.py +1 -0
  1250. transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
  1251. transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
  1252. transformers/models/swin2sr/modeling_swin2sr.py +63 -81
  1253. transformers/models/swinv2/configuration_swinv2.py +1 -0
  1254. transformers/models/swinv2/modeling_swinv2.py +104 -108
  1255. transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
  1256. transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
  1257. transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
  1258. transformers/models/t5/configuration_t5.py +8 -14
  1259. transformers/models/t5/modeling_t5.py +92 -88
  1260. transformers/models/t5/tokenization_t5.py +9 -3
  1261. transformers/models/t5gemma/configuration_t5gemma.py +41 -43
  1262. transformers/models/t5gemma/modeling_t5gemma.py +107 -104
  1263. transformers/models/t5gemma/modular_t5gemma.py +120 -124
  1264. transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
  1265. transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
  1266. transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
  1267. transformers/models/table_transformer/configuration_table_transformer.py +2 -1
  1268. transformers/models/table_transformer/modeling_table_transformer.py +49 -51
  1269. transformers/models/tapas/configuration_tapas.py +2 -12
  1270. transformers/models/tapas/modeling_tapas.py +67 -68
  1271. transformers/models/tapas/tokenization_tapas.py +153 -115
  1272. transformers/models/textnet/configuration_textnet.py +1 -0
  1273. transformers/models/textnet/image_processing_textnet.py +25 -22
  1274. transformers/models/textnet/image_processing_textnet_fast.py +10 -8
  1275. transformers/models/textnet/modeling_textnet.py +16 -28
  1276. transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
  1277. transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
  1278. transformers/models/timesfm/configuration_timesfm.py +1 -0
  1279. transformers/models/timesfm/modeling_timesfm.py +22 -33
  1280. transformers/models/timesfm/modular_timesfm.py +21 -32
  1281. transformers/models/timesformer/configuration_timesformer.py +1 -0
  1282. transformers/models/timesformer/modeling_timesformer.py +16 -15
  1283. transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
  1284. transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
  1285. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
  1286. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
  1287. transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
  1288. transformers/models/trocr/configuration_trocr.py +8 -11
  1289. transformers/models/trocr/modeling_trocr.py +44 -45
  1290. transformers/models/trocr/processing_trocr.py +25 -5
  1291. transformers/models/tvp/configuration_tvp.py +2 -5
  1292. transformers/models/tvp/image_processing_tvp.py +52 -50
  1293. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1294. transformers/models/tvp/modeling_tvp.py +27 -27
  1295. transformers/models/tvp/processing_tvp.py +14 -2
  1296. transformers/models/udop/configuration_udop.py +7 -16
  1297. transformers/models/udop/modeling_udop.py +73 -71
  1298. transformers/models/udop/processing_udop.py +26 -7
  1299. transformers/models/udop/tokenization_udop.py +105 -84
  1300. transformers/models/umt5/configuration_umt5.py +7 -8
  1301. transformers/models/umt5/modeling_umt5.py +90 -94
  1302. transformers/models/unispeech/configuration_unispeech.py +2 -4
  1303. transformers/models/unispeech/modeling_unispeech.py +49 -51
  1304. transformers/models/unispeech/modular_unispeech.py +22 -22
  1305. transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
  1306. transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
  1307. transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
  1308. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1309. transformers/models/univnet/modeling_univnet.py +8 -8
  1310. transformers/models/upernet/configuration_upernet.py +1 -0
  1311. transformers/models/upernet/modeling_upernet.py +13 -11
  1312. transformers/models/vaultgemma/__init__.py +1 -0
  1313. transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
  1314. transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
  1315. transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
  1316. transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
  1317. transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
  1318. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
  1319. transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
  1320. transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
  1321. transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
  1322. transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
  1323. transformers/models/video_llava/configuration_video_llava.py +1 -4
  1324. transformers/models/video_llava/image_processing_video_llava.py +38 -35
  1325. transformers/models/video_llava/modeling_video_llava.py +146 -146
  1326. transformers/models/video_llava/processing_video_llava.py +78 -38
  1327. transformers/models/video_llava/video_processing_video_llava.py +1 -0
  1328. transformers/models/videomae/configuration_videomae.py +1 -0
  1329. transformers/models/videomae/image_processing_videomae.py +34 -31
  1330. transformers/models/videomae/modeling_videomae.py +17 -14
  1331. transformers/models/videomae/video_processing_videomae.py +1 -0
  1332. transformers/models/vilt/configuration_vilt.py +4 -6
  1333. transformers/models/vilt/image_processing_vilt.py +30 -29
  1334. transformers/models/vilt/image_processing_vilt_fast.py +16 -15
  1335. transformers/models/vilt/modeling_vilt.py +90 -116
  1336. transformers/models/vilt/processing_vilt.py +14 -2
  1337. transformers/models/vipllava/configuration_vipllava.py +1 -4
  1338. transformers/models/vipllava/modeling_vipllava.py +70 -99
  1339. transformers/models/vipllava/modular_vipllava.py +54 -78
  1340. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
  1341. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
  1342. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
  1343. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
  1344. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
  1345. transformers/models/visual_bert/configuration_visual_bert.py +2 -6
  1346. transformers/models/visual_bert/modeling_visual_bert.py +92 -98
  1347. transformers/models/vit/configuration_vit.py +1 -0
  1348. transformers/models/vit/image_processing_vit.py +22 -19
  1349. transformers/models/vit/image_processing_vit_fast.py +1 -0
  1350. transformers/models/vit/modeling_vit.py +17 -17
  1351. transformers/models/vit_mae/configuration_vit_mae.py +1 -0
  1352. transformers/models/vit_mae/modeling_vit_mae.py +27 -29
  1353. transformers/models/vit_msn/configuration_vit_msn.py +1 -0
  1354. transformers/models/vit_msn/modeling_vit_msn.py +16 -18
  1355. transformers/models/vitdet/configuration_vitdet.py +1 -0
  1356. transformers/models/vitdet/modeling_vitdet.py +14 -14
  1357. transformers/models/vitmatte/configuration_vitmatte.py +5 -2
  1358. transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
  1359. transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
  1360. transformers/models/vitmatte/modeling_vitmatte.py +11 -14
  1361. transformers/models/vitpose/configuration_vitpose.py +7 -4
  1362. transformers/models/vitpose/image_processing_vitpose.py +25 -24
  1363. transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
  1364. transformers/models/vitpose/modeling_vitpose.py +14 -14
  1365. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
  1366. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
  1367. transformers/models/vits/configuration_vits.py +1 -4
  1368. transformers/models/vits/modeling_vits.py +42 -44
  1369. transformers/models/vits/tokenization_vits.py +4 -3
  1370. transformers/models/vivit/configuration_vivit.py +1 -0
  1371. transformers/models/vivit/image_processing_vivit.py +39 -36
  1372. transformers/models/vivit/modeling_vivit.py +8 -6
  1373. transformers/models/vjepa2/__init__.py +1 -0
  1374. transformers/models/vjepa2/configuration_vjepa2.py +1 -0
  1375. transformers/models/vjepa2/modeling_vjepa2.py +32 -31
  1376. transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
  1377. transformers/models/voxtral/__init__.py +1 -0
  1378. transformers/models/voxtral/configuration_voxtral.py +2 -0
  1379. transformers/models/voxtral/modeling_voxtral.py +47 -40
  1380. transformers/models/voxtral/modular_voxtral.py +40 -37
  1381. transformers/models/voxtral/processing_voxtral.py +48 -25
  1382. transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
  1383. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
  1384. transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
  1385. transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
  1386. transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
  1387. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
  1388. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
  1389. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
  1390. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
  1391. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
  1392. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
  1393. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
  1394. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
  1395. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
  1396. transformers/models/wavlm/configuration_wavlm.py +2 -4
  1397. transformers/models/wavlm/modeling_wavlm.py +48 -50
  1398. transformers/models/wavlm/modular_wavlm.py +5 -4
  1399. transformers/models/whisper/configuration_whisper.py +5 -6
  1400. transformers/models/whisper/english_normalizer.py +4 -3
  1401. transformers/models/whisper/feature_extraction_whisper.py +24 -9
  1402. transformers/models/whisper/generation_whisper.py +48 -26
  1403. transformers/models/whisper/modeling_whisper.py +73 -79
  1404. transformers/models/whisper/processing_whisper.py +20 -3
  1405. transformers/models/whisper/tokenization_whisper.py +43 -11
  1406. transformers/models/x_clip/configuration_x_clip.py +2 -4
  1407. transformers/models/x_clip/modeling_x_clip.py +93 -96
  1408. transformers/models/x_clip/processing_x_clip.py +14 -2
  1409. transformers/models/xcodec/configuration_xcodec.py +6 -4
  1410. transformers/models/xcodec/modeling_xcodec.py +17 -20
  1411. transformers/models/xglm/configuration_xglm.py +8 -9
  1412. transformers/models/xglm/modeling_xglm.py +55 -60
  1413. transformers/models/xglm/tokenization_xglm.py +11 -3
  1414. transformers/models/xlm/configuration_xlm.py +8 -10
  1415. transformers/models/xlm/modeling_xlm.py +144 -144
  1416. transformers/models/xlm/tokenization_xlm.py +5 -3
  1417. transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
  1418. transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
  1419. transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
  1420. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
  1421. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
  1422. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
  1423. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
  1424. transformers/models/xlnet/configuration_xlnet.py +12 -3
  1425. transformers/models/xlnet/modeling_xlnet.py +163 -152
  1426. transformers/models/xlnet/tokenization_xlnet.py +9 -2
  1427. transformers/models/xlstm/configuration_xlstm.py +12 -8
  1428. transformers/models/xlstm/modeling_xlstm.py +65 -62
  1429. transformers/models/xmod/configuration_xmod.py +3 -11
  1430. transformers/models/xmod/modeling_xmod.py +110 -108
  1431. transformers/models/yolos/configuration_yolos.py +1 -0
  1432. transformers/models/yolos/image_processing_yolos.py +62 -60
  1433. transformers/models/yolos/image_processing_yolos_fast.py +45 -42
  1434. transformers/models/yolos/modeling_yolos.py +16 -16
  1435. transformers/models/yolos/modular_yolos.py +19 -17
  1436. transformers/models/yoso/configuration_yoso.py +2 -8
  1437. transformers/models/yoso/modeling_yoso.py +63 -70
  1438. transformers/models/zamba/configuration_zamba.py +8 -5
  1439. transformers/models/zamba/modeling_zamba.py +78 -81
  1440. transformers/models/zamba2/configuration_zamba2.py +50 -44
  1441. transformers/models/zamba2/modeling_zamba2.py +97 -97
  1442. transformers/models/zamba2/modular_zamba2.py +48 -46
  1443. transformers/models/zoedepth/configuration_zoedepth.py +2 -1
  1444. transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
  1445. transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
  1446. transformers/models/zoedepth/modeling_zoedepth.py +18 -26
  1447. transformers/pipelines/__init__.py +114 -57
  1448. transformers/pipelines/any_to_any.py +22 -14
  1449. transformers/pipelines/audio_utils.py +2 -1
  1450. transformers/pipelines/automatic_speech_recognition.py +12 -20
  1451. transformers/pipelines/base.py +27 -15
  1452. transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
  1453. transformers/pipelines/deprecated/text2text_generation.py +408 -0
  1454. transformers/pipelines/document_question_answering.py +2 -4
  1455. transformers/pipelines/image_text_to_text.py +1 -0
  1456. transformers/pipelines/image_to_text.py +229 -0
  1457. transformers/pipelines/question_answering.py +44 -5
  1458. transformers/pipelines/text_classification.py +14 -1
  1459. transformers/pipelines/text_generation.py +1 -1
  1460. transformers/pipelines/text_to_audio.py +2 -2
  1461. transformers/pipelines/token_classification.py +22 -1
  1462. transformers/pipelines/video_classification.py +9 -1
  1463. transformers/pipelines/zero_shot_audio_classification.py +1 -0
  1464. transformers/pipelines/zero_shot_classification.py +6 -0
  1465. transformers/pipelines/zero_shot_image_classification.py +7 -0
  1466. transformers/processing_utils.py +145 -230
  1467. transformers/quantizers/auto.py +4 -2
  1468. transformers/quantizers/base.py +173 -53
  1469. transformers/quantizers/quantizer_aqlm.py +23 -2
  1470. transformers/quantizers/quantizer_auto_round.py +12 -2
  1471. transformers/quantizers/quantizer_awq.py +89 -20
  1472. transformers/quantizers/quantizer_bitnet.py +14 -4
  1473. transformers/quantizers/quantizer_bnb_4bit.py +155 -18
  1474. transformers/quantizers/quantizer_bnb_8bit.py +110 -24
  1475. transformers/quantizers/quantizer_compressed_tensors.py +9 -2
  1476. transformers/quantizers/quantizer_eetq.py +74 -16
  1477. transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
  1478. transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
  1479. transformers/quantizers/quantizer_fp_quant.py +82 -52
  1480. transformers/quantizers/quantizer_gptq.py +28 -8
  1481. transformers/quantizers/quantizer_higgs.py +60 -42
  1482. transformers/quantizers/quantizer_hqq.py +153 -144
  1483. transformers/quantizers/quantizer_mxfp4.py +194 -14
  1484. transformers/quantizers/quantizer_quanto.py +79 -35
  1485. transformers/quantizers/quantizer_quark.py +18 -36
  1486. transformers/quantizers/quantizer_spqr.py +12 -4
  1487. transformers/quantizers/quantizer_torchao.py +325 -50
  1488. transformers/quantizers/quantizer_vptq.py +27 -4
  1489. transformers/quantizers/quantizers_utils.py +0 -20
  1490. transformers/safetensors_conversion.py +3 -9
  1491. transformers/testing_utils.py +82 -326
  1492. transformers/tokenization_mistral_common.py +903 -568
  1493. transformers/tokenization_utils_base.py +340 -220
  1494. transformers/tokenization_utils_sentencepiece.py +6 -5
  1495. transformers/tokenization_utils_tokenizers.py +113 -226
  1496. transformers/trainer.py +53 -60
  1497. transformers/trainer_callback.py +0 -8
  1498. transformers/trainer_seq2seq.py +1 -5
  1499. transformers/trainer_utils.py +1 -1
  1500. transformers/training_args.py +41 -77
  1501. transformers/utils/__init__.py +4 -8
  1502. transformers/utils/attention_visualizer.py +5 -5
  1503. transformers/utils/auto_docstring.py +37 -599
  1504. transformers/utils/doc.py +36 -4
  1505. transformers/utils/dummy_pt_objects.py +42 -0
  1506. transformers/utils/generic.py +28 -111
  1507. transformers/utils/hub.py +15 -5
  1508. transformers/utils/import_utils.py +32 -165
  1509. transformers/utils/kernel_config.py +19 -74
  1510. transformers/utils/loading_report.py +15 -25
  1511. transformers/utils/quantization_config.py +241 -72
  1512. transformers/video_processing_utils.py +39 -41
  1513. transformers/video_utils.py +22 -18
  1514. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
  1515. transformers-5.0.0rc0.dist-info/RECORD +1987 -0
  1516. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
  1517. transformers/integrations/moe.py +0 -360
  1518. transformers/integrations/quark.py +0 -53
  1519. transformers/loss/loss_lw_detr.py +0 -356
  1520. transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
  1521. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
  1522. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
  1523. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
  1524. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
  1525. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
  1526. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
  1527. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
  1528. transformers/models/fast_vlm/__init__.py +0 -27
  1529. transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
  1530. transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
  1531. transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
  1532. transformers/models/glm4_moe_lite/__init__.py +0 -28
  1533. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
  1534. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
  1535. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
  1536. transformers/models/glm_image/__init__.py +0 -31
  1537. transformers/models/glm_image/configuration_glm_image.py +0 -351
  1538. transformers/models/glm_image/image_processing_glm_image.py +0 -503
  1539. transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
  1540. transformers/models/glm_image/modeling_glm_image.py +0 -1642
  1541. transformers/models/glm_image/modular_glm_image.py +0 -1531
  1542. transformers/models/glm_image/processing_glm_image.py +0 -217
  1543. transformers/models/glmasr/__init__.py +0 -29
  1544. transformers/models/glmasr/configuration_glmasr.py +0 -196
  1545. transformers/models/glmasr/modeling_glmasr.py +0 -517
  1546. transformers/models/glmasr/modular_glmasr.py +0 -443
  1547. transformers/models/glmasr/processing_glmasr.py +0 -331
  1548. transformers/models/jais2/__init__.py +0 -27
  1549. transformers/models/jais2/configuration_jais2.py +0 -148
  1550. transformers/models/jais2/modeling_jais2.py +0 -484
  1551. transformers/models/jais2/modular_jais2.py +0 -194
  1552. transformers/models/lasr/__init__.py +0 -29
  1553. transformers/models/lasr/configuration_lasr.py +0 -244
  1554. transformers/models/lasr/feature_extraction_lasr.py +0 -275
  1555. transformers/models/lasr/modeling_lasr.py +0 -727
  1556. transformers/models/lasr/modular_lasr.py +0 -574
  1557. transformers/models/lasr/processing_lasr.py +0 -100
  1558. transformers/models/lasr/tokenization_lasr.py +0 -184
  1559. transformers/models/lighton_ocr/__init__.py +0 -28
  1560. transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
  1561. transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
  1562. transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
  1563. transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
  1564. transformers/models/lw_detr/__init__.py +0 -27
  1565. transformers/models/lw_detr/configuration_lw_detr.py +0 -374
  1566. transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
  1567. transformers/models/lw_detr/modular_lw_detr.py +0 -1615
  1568. transformers/models/minimax_m2/__init__.py +0 -28
  1569. transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
  1570. transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
  1571. transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
  1572. transformers/models/paddleocr_vl/__init__.py +0 -31
  1573. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
  1574. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
  1575. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
  1576. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
  1577. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
  1578. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
  1579. transformers/models/pe_audio/__init__.py +0 -29
  1580. transformers/models/pe_audio/configuration_pe_audio.py +0 -204
  1581. transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
  1582. transformers/models/pe_audio/modeling_pe_audio.py +0 -819
  1583. transformers/models/pe_audio/modular_pe_audio.py +0 -298
  1584. transformers/models/pe_audio_video/__init__.py +0 -28
  1585. transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
  1586. transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
  1587. transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
  1588. transformers/models/pe_video/__init__.py +0 -29
  1589. transformers/models/pe_video/configuration_pe_video.py +0 -209
  1590. transformers/models/pe_video/modeling_pe_video.py +0 -647
  1591. transformers/models/pe_video/modular_pe_video.py +0 -231
  1592. transformers/models/pe_video/processing_pe_video.py +0 -10
  1593. transformers/models/pe_video/video_processing_pe_video.py +0 -64
  1594. transformers/models/pixio/__init__.py +0 -29
  1595. transformers/models/pixio/configuration_pixio.py +0 -150
  1596. transformers/models/pixio/modeling_pixio.py +0 -507
  1597. transformers/models/pixio/modular_pixio.py +0 -403
  1598. transformers/models/solar_open/__init__.py +0 -27
  1599. transformers/models/solar_open/configuration_solar_open.py +0 -184
  1600. transformers/models/solar_open/modeling_solar_open.py +0 -642
  1601. transformers/models/solar_open/modular_solar_open.py +0 -224
  1602. transformers/trainer_jit_checkpoint.py +0 -125
  1603. transformers-5.0.0.dist-info/RECORD +0 -2068
  1604. {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
  1605. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
  1606. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -15,6 +16,7 @@
 import collections
 import copy
 import functools
+import gc
 import importlib.metadata
 import inspect
 import json
@@ -24,18 +26,17 @@ import sys
 import warnings
 from abc import abstractmethod
 from collections import defaultdict
-from collections.abc import Callable, Iterator
+from collections.abc import Callable, Sequence
 from contextlib import contextmanager
-from dataclasses import dataclass, field, replace
 from enum import Enum
 from functools import partial, wraps
 from itertools import cycle
 from threading import Thread
-from typing import Optional, TypeVar, get_type_hints
+from typing import Optional, TypeVar, Union, get_type_hints
 from zipfile import is_zipfile

 import torch
-from huggingface_hub import create_repo, is_offline_mode, split_torch_state_dict_into_shards
+from huggingface_hub import create_repo, split_torch_state_dict_into_shards
 from packaging import version
 from safetensors import safe_open
 from safetensors.torch import save_file as safe_save_file
@@ -62,8 +63,7 @@ from .integrations.accelerate import (
     accelerate_dispatch,
     check_and_set_device_map,
     expand_device_map,
-    get_device,
-    load_offloaded_parameter,
+    init_empty_weights,
 )
 from .integrations.deepspeed import _load_state_dict_into_zero3_model
 from .integrations.eager_paged import eager_paged_attention_forward
@@ -85,8 +85,7 @@ from .integrations.tensor_parallel import (
     verify_tp_plan,
 )
 from .loss.loss_utils import LOSS_MAPPING
-from .modeling_flash_attention_utils import lazy_import_flash_attention, lazy_import_paged_flash_attention
-from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from .modeling_flash_attention_utils import lazy_import_flash_attention
 from .pytorch_utils import id_tensor_storage
 from .quantizers import HfQuantizer
 from .quantizers.auto import get_hf_quantizer
@@ -94,6 +93,7 @@ from .quantizers.quantizers_utils import get_module_from_name
 from .safetensors_conversion import auto_conversion
 from .utils import (
     ADAPTER_SAFE_WEIGHTS_NAME,
+    ADAPTER_WEIGHTS_NAME,
     DUMMY_INPUTS,
     SAFE_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_NAME,
@@ -107,12 +107,10 @@ from .utils import (
     copy_func,
     has_file,
     is_accelerate_available,
-    is_bitsandbytes_available,
-    is_env_variable_true,
     is_flash_attn_2_available,
     is_flash_attn_3_available,
-    is_grouped_mm_available,
     is_kernels_available,
+    is_offline_mode,
     is_torch_flex_attn_available,
     is_torch_greater_or_equal,
     is_torch_mlu_available,
@@ -120,7 +118,7 @@ from .utils import (
     is_torch_xpu_available,
     logging,
 )
-from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder, is_flash_attention_requested
+from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder
 from .utils.hub import DownloadKwargs, create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
     is_huggingface_hub_greater_or_equal,
@@ -134,6 +132,7 @@ from .utils.quantization_config import QuantizationMethod
 if is_accelerate_available():
     from accelerate.hooks import add_hook_to_module
     from accelerate.utils import extract_model_from_parallel
+    from accelerate.utils.modeling import get_state_dict_from_offload


 _torch_distributed_available = torch.distributed.is_available()
@@ -155,63 +154,62 @@ logger = logging.get_logger(__name__)
155
154
  XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
156
155
  XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
157
156
  SpecificPreTrainedModelType = TypeVar("SpecificPreTrainedModelType", bound="PreTrainedModel")
157
+ _init_weights = True
158
158
  _is_quantized = False
159
159
  _is_ds_init_called = False
160
160
 
161
- # Mapping from flash attention implementations to their kernel fallback repositories
162
- FLASH_ATTN_KERNEL_FALLBACK = {
163
- "flash_attention_2": "kernels-community/flash-attn2",
164
- "flash_attention_3": "kernels-community/vllm-flash-attn3",
165
- }
166
-
167
161
 
168
- @dataclass(frozen=True)
169
- class LoadStateDictConfig:
170
- """
171
- Config for loading weights. This allows bundling arguments that are just
172
- passed around.
173
- """
162
+ def is_local_dist_rank_0():
163
+ return (
164
+ torch.distributed.is_available()
165
+ and torch.distributed.is_initialized()
166
+ and int(os.environ.get("LOCAL_RANK", "-1")) == 0
167
+ )
174
168
 
175
- pretrained_model_name_or_path: str | None = None
176
- download_kwargs: DownloadKwargs | None = field(default_factory=DownloadKwargs)
177
- use_safetensors: bool = True
178
- ignore_mismatched_sizes: bool = False
179
- sharded_metadata: dict | None = None
180
- device_map: dict | None = None
181
- disk_offload_folder: str | None = None
182
- offload_buffers: bool = False
183
- dtype: torch.dtype | None = None
184
- hf_quantizer: HfQuantizer | None = None
185
- device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None
186
- weights_only: bool = True
187
- weight_mapping: list[WeightConverter | WeightRenaming] | None = None
188
169
 
189
- @property
190
- def is_quantized(self) -> bool:
191
- return self.hf_quantizer is not None
170
+ TORCH_INIT_FUNCTIONS = {
171
+ "uniform_": nn.init.uniform_,
172
+ "normal_": nn.init.normal_,
173
+ "trunc_normal_": nn.init.trunc_normal_,
174
+ "constant_": nn.init.constant_,
175
+ "xavier_uniform_": nn.init.xavier_uniform_,
176
+ "xavier_normal_": nn.init.xavier_normal_,
177
+ "kaiming_uniform_": nn.init.kaiming_uniform_,
178
+ "kaiming_normal_": nn.init.kaiming_normal_,
179
+ "uniform": nn.init.uniform,
180
+ "normal": nn.init.normal,
181
+ "xavier_uniform": nn.init.xavier_uniform,
182
+ "xavier_normal": nn.init.xavier_normal,
183
+ "kaiming_uniform": nn.init.kaiming_uniform,
184
+ "kaiming_normal": nn.init.kaiming_normal,
185
+ "orthogonal_": nn.init.orthogonal_,
186
+ }
192
187
 
193
188
 
194
- @dataclass
195
- class LoadStateDictInfo:
189
+ @contextmanager
190
+ def no_init_weights():
196
191
  """
197
- Return container for state-dict loading results and diagnostics.
198
- This simplifies the code a bit.
192
+ Context manager to globally disable weight initialization to speed up loading large models.
199
193
  """
194
+ global _init_weights
195
+ old_init_weights = _init_weights
200
196
 
201
- missing_keys: set[str]
202
- unexpected_keys: set[str]
203
- mismatched_keys: set[tuple[str, torch.Size]]
204
- disk_offload_index: dict[str, str] | None
205
- error_msgs: list[str]
206
- conversion_errors: set[str]
197
+ _init_weights = False
207
198
 
199
+ def _skip_init(*args, **kwargs):
200
+ pass
208
201
 
209
- def is_local_dist_rank_0():
210
- return (
211
- torch.distributed.is_available()
212
- and torch.distributed.is_initialized()
213
- and int(os.environ.get("LOCAL_RANK", "-1")) == 0
214
- )
202
+ # Replace the init functions with a no-op (the originals are kept in TORCH_INIT_FUNCTIONS)
203
+ for name, init_func in TORCH_INIT_FUNCTIONS.items():
204
+ setattr(torch.nn.init, name, _skip_init)
205
+
206
+ try:
207
+ yield
208
+ finally:
209
+ _init_weights = old_init_weights
210
+ # Restore the original initialization functions
211
+ for name, init_func in TORCH_INIT_FUNCTIONS.items():
212
+ setattr(torch.nn.init, name, init_func)
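# Illustrative sketch of the context manager above: inside `no_init_weights`, the torch init
# functions listed in TORCH_INIT_FUNCTIONS become no-ops, so constructing a module skips the
# expensive random initialization. `DemoBlock` is a made-up module used purely for illustration.
class DemoBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(1024, 1024)
        nn.init.normal_(self.proj.weight, std=0.02)  # a no-op while the context is active

with no_init_weights():
    block = DemoBlock()  # weights are left as allocated; real values are expected to be loaded afterwards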
215
213
 
216
214
 
217
215
  @contextmanager
@@ -237,28 +235,23 @@ def set_zero3_state():
237
235
  _is_ds_init_called = False
238
236
 
239
237
 
240
- @contextmanager
241
- def local_torch_dtype(dtype: torch.dtype, model_class_name: str | None = None):
238
+ def restore_default_dtype(func):
242
239
  """
243
- Locally change the torch default dtype to `dtype`, and restore the old one upon exiting the context.
244
- If `model_class_name` is provided, it's used to provide a more helpful error message if `dtype` is not valid.
240
+ Decorator to restore the default torch dtype at the end of the function.
241
+ Serves as a backup in case calling the function raises an error after
242
+ the function has changed the default dtype but before it could
243
+ restore it.
245
244
  """
246
- # Just a more helping error before we set `torch.set_default_dtype` later on which would crash in this case
247
- if not dtype.is_floating_point:
248
- if model_class_name is not None:
249
- error_message = (
250
- f"{model_class_name} cannot be instantiated under `dtype={dtype}` as it's not a floating-point dtype"
251
- )
252
- else:
253
- error_message = f"Cannot set `{dtype}` as torch's default as it's not a floating-point dtype"
254
- raise ValueError(error_message)
255
245
 
256
- original_dtype = torch.get_default_dtype()
257
- try:
258
- torch.set_default_dtype(dtype)
259
- yield
260
- finally:
261
- torch.set_default_dtype(original_dtype)
246
+ @wraps(func)
247
+ def _wrapper(*args, **kwargs):
248
+ old_dtype = torch.get_default_dtype()
249
+ try:
250
+ return func(*args, **kwargs)
251
+ finally:
252
+ torch.set_default_dtype(old_dtype)
253
+
254
+ return _wrapper
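# Minimal sketch of the decorator above: `build_in_half_precision` is a hypothetical helper that
# switches the default dtype; even if it raised midway, the decorator would restore the old default.
@restore_default_dtype
def build_in_half_precision():
    torch.set_default_dtype(torch.float16)
    return nn.Linear(8, 8)  # allocated in fp16 because of the temporary default dtype

layer = build_in_half_precision()
# torch.get_default_dtype() is back to whatever it was before the call (usually torch.float32)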
262
255
 
263
256
 
264
257
  def get_torch_context_manager_or_global_device():
@@ -286,9 +279,7 @@ def get_state_dict_dtype(state_dict):
286
279
  return t.dtype
287
280
 
288
281
  # if no floating dtype was found return whatever the first dtype is
289
- if len(state_dict) == 0:
290
- return torch.float32
291
- return next(iter(state_dict.values())).dtype
282
+ return next(iter(state_dict.values())).dtype
292
283
 
293
284
 
294
285
  str_to_torch_dtype = {
@@ -314,7 +305,7 @@ if is_torch_greater_or_equal("2.3.0"):
314
305
 
315
306
 
316
307
  def load_state_dict(
317
- checkpoint_file: str | os.PathLike, map_location: str | torch.device = "cpu", weights_only: bool = True
308
+ checkpoint_file: Union[str, os.PathLike], map_location: Union[str, torch.device] = "cpu", weights_only: bool = True
318
309
  ) -> dict[str, torch.Tensor]:
319
310
  """
320
311
  Reads a `safetensors` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.
@@ -414,97 +405,14 @@ def _find_identical(tensors: list[set[str]], state_dict: dict[str, torch.Tensor]
414
405
  return shared_tensors, identical
415
406
 
416
407
 
417
- def remove_tied_weights_from_state_dict(
418
- state_dict: dict[str, torch.Tensor], model: "PreTrainedModel"
419
- ) -> dict[str, torch.Tensor]:
420
- """
421
- Remove all tied weights from the given `state_dict`, making sure to keep only the main weight that `model`
422
- will expect when reloading (even if we know tie weights symmetrically, it's better to keep the intended one).
423
- This is because `safetensors` does not allow tensor aliasing - so we're going to remove aliases before saving.
424
- """
425
- # To avoid any potential mistakes and mismatches between config and actual tied weights, here we check the pointers
426
- # of the Tensors themselves -> we are guaranteed to find all the actual tied weights
427
- ptrs = collections.defaultdict(list)
428
- for name, tensor in state_dict.items():
429
- if not isinstance(tensor, torch.Tensor):
430
- # Sometimes in the state_dict we have non-tensor objects.
431
- # e.g. in bitsandbytes we have some `str` objects in the state_dict
432
- # In the non-tensor case, fall back to the pointer of the object itself
433
- ptrs[id(tensor)].append(name)
434
-
435
- elif tensor.device.type == "meta":
436
- # In offloaded cases, there may be meta tensors in the state_dict.
437
- # For these cases, key by the pointer of the original tensor object
438
- # (state_dict tensors are detached and therefore no longer shared)
439
- tensor = model.get_parameter(name)
440
- ptrs[id(tensor)].append(name)
441
-
442
- else:
443
- ptrs[id_tensor_storage(tensor)].append(name)
444
-
445
- shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
446
-
447
- # Recursively descend to find tied weight keys
448
- all_potential_tied_weights_keys = set(_get_tied_weight_keys(model))
449
- error_names = []
450
- to_delete_names = set()
451
- # Remove the keys that are declared as known duplicates on load. This ensures that the name which is
453
- # kept stays consistent
453
- if all_potential_tied_weights_keys is not None:
454
- for names in shared_ptrs.values():
455
- found = 0
456
- for name in sorted(names):
457
- matches_pattern = any(re.search(pat, name) for pat in all_potential_tied_weights_keys)
458
- if matches_pattern and name in state_dict:
459
- found += 1
460
- if found < len(names):
461
- to_delete_names.add(name)
462
- # We are entering a place where the weights and the transformers configuration do NOT match.
463
- shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
464
- # Those are actually tensor sharing but disjoint from each other, we can safely clone them
465
- # The reloaded model won't have the same property, but it shouldn't matter in any meaningful way.
466
- for name in disjoint_names:
467
- state_dict[name] = state_dict[name].clone()
468
-
469
- # When not all duplicates have been cleaned, still remove those keys, but emit a clear warning.
470
- # If the tensors were tied at runtime, `from_pretrained` will not get the removed key back,
471
- # leading to a randomly initialized tensor. A warning will be shown during reload (if applicable),
472
- # but since the file is not necessarily compatible with the config, it is better to also warn
473
- # clearly here.
474
- shared_names, identical_names = _find_identical(shared_names, state_dict)
475
- # delete tensors that have identical storage
476
- for inames in identical_names:
477
- known = inames.intersection(to_delete_names)
478
- for name in known:
479
- del state_dict[name]
480
- unknown = inames.difference(to_delete_names)
481
- if len(unknown) > 1:
482
- error_names.append(unknown)
483
-
484
- if shared_names:
485
- error_names.extend(shared_names)
486
-
487
- if len(error_names) > 0:
488
- raise RuntimeError(
489
- f"The weights trying to be saved contained shared tensors {error_names} which are not properly defined. "
490
- f"We found all the potential target tied weights keys to be: {all_potential_tied_weights_keys}.\n"
491
- "This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.",
492
- )
493
-
494
- return state_dict
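# Rough illustration of the aliasing handled above: both names point at the same storage, and
# safetensors refuses to serialize aliased tensors, so only one of the two keys can be written
# out (the other one is re-tied when the checkpoint is loaded back). The names are made up.
shared = torch.empty(10, 4)
toy_state_dict = {"model.embed_tokens.weight": shared, "lm_head.weight": shared}
assert id_tensor_storage(toy_state_dict["lm_head.weight"]) == id_tensor_storage(
    toy_state_dict["model.embed_tokens.weight"]
)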
495
-
496
-
497
408
  def _load_parameter_into_model(model: "PreTrainedModel", param_name: str, tensor: torch.Tensor):
498
- """Cast a single parameter or buffer `param_name` into the `model`, with value `tensor`."""
499
- parent, param_type = get_module_from_name(model, param_name)
500
- if param_type in parent._parameters and not isinstance(tensor, nn.Parameter):
501
- tensor = nn.Parameter(tensor, requires_grad=tensor.is_floating_point())
502
- # We need to use setattr here, as we set non-persistent buffers as well with this function (`load_state_dict`
503
- # does not allow to do it)
504
- setattr(parent, param_type, tensor)
409
+ """Cast a single parameter `param_name` into the `model`, with value `tensor`."""
410
+ module, param_type = get_module_from_name(model, param_name)
411
+ # This will catch a potential shape mismatch if the check was skipped earlier
412
+ module.load_state_dict({param_type: tensor}, strict=False, assign=True)
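# Quick sketch with a bare nn.Module standing in for a PreTrainedModel: the dotted name is resolved
# to the owning sub-module, and the tensor is assigned in place of the existing parameter.
toy = nn.Sequential(nn.Linear(4, 4))
_load_parameter_into_model(toy, "0.weight", torch.zeros(4, 4))
assert torch.count_nonzero(toy[0].weight) == 0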
505
413
 
506
414
 
507
- def _add_variant(weights_name: str, variant: str | None = None) -> str:
415
+ def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
508
416
  if variant is not None:
509
417
  path, name = weights_name.rsplit(".", 1)
510
418
  weights_name = f"{path}.{variant}.{name}"
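        # e.g. _add_variant("model.safetensors", variant="fp16") -> "model.fp16.safetensors"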
@@ -512,20 +420,19 @@ def _add_variant(weights_name: str, variant: str | None = None) -> str:
512
420
 
513
421
 
514
422
  def _get_resolved_checkpoint_files(
515
- pretrained_model_name_or_path: str | os.PathLike | None,
516
- variant: str | None,
517
- gguf_file: str | None,
518
- use_safetensors: bool | None,
519
- user_agent: dict | None,
423
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
424
+ variant: Optional[str],
425
+ gguf_file: Optional[str],
426
+ use_safetensors: Optional[bool],
427
+ download_kwargs: DownloadKwargs,
428
+ user_agent: dict,
520
429
  is_remote_code: bool, # Because we can't determine this inside this function, we need it to be passed in
521
- transformers_explicit_filename: str | None = None,
522
- download_kwargs: DownloadKwargs | None = None,
523
- ) -> tuple[list[str] | None, dict | None]:
430
+ transformers_explicit_filename: Optional[str] = None,
431
+ ) -> tuple[Optional[list[str]], Optional[dict]]:
524
432
  """Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
525
433
  checkpoints are sharded.
526
434
  This function will download the data if necessary.
527
435
  """
528
- download_kwargs = download_kwargs or DownloadKwargs()
529
436
  cache_dir = download_kwargs.get("cache_dir")
530
437
  force_download = download_kwargs.get("force_download", False)
531
438
  proxies = download_kwargs.get("proxies")
@@ -538,19 +445,17 @@ def _get_resolved_checkpoint_files(
538
445
  if not transformers_explicit_filename.endswith(".safetensors") and not transformers_explicit_filename.endswith(
539
446
  ".safetensors.index.json"
540
447
  ):
541
- if transformers_explicit_filename != "adapter_model.bin":
542
- raise ValueError(
543
- "The transformers file in the config seems to be incorrect: it is neither a safetensors file "
544
- "(*.safetensors) nor a safetensors index file (*.safetensors.index.json): "
545
- f"{transformers_explicit_filename}"
546
- )
448
+ raise ValueError(
449
+ "The transformers file in the config seems to be incorrect: it is neither a safetensors file "
450
+ "(*.safetensors) nor a safetensors index file (*.safetensors.index.json): "
451
+ f"{transformers_explicit_filename}"
452
+ )
547
453
 
548
454
  is_sharded = False
549
455
 
550
456
  if pretrained_model_name_or_path is not None and gguf_file is None:
551
457
  pretrained_model_name_or_path = str(pretrained_model_name_or_path)
552
458
  is_local = os.path.isdir(pretrained_model_name_or_path)
553
- # If the path is a local folder (but not in the HF_HOME cache, even if it's technically local)
554
459
  if is_local:
555
460
  if transformers_explicit_filename is not None:
556
461
  # If the filename is explicitly defined, load this by default.
@@ -609,38 +514,25 @@ def _get_resolved_checkpoint_files(
609
514
  else:
610
515
  filename = _add_variant(WEIGHTS_NAME, variant)
611
516
 
612
- # Prepare set of kwargs for hub functions
613
- has_file_kwargs = {
614
- "revision": revision,
615
- "proxies": proxies,
616
- "token": token,
617
- "cache_dir": cache_dir,
618
- "local_files_only": local_files_only,
619
- }
620
- cached_file_kwargs = {
621
- "force_download": force_download,
622
- "user_agent": user_agent,
623
- "subfolder": subfolder,
624
- "_raise_exceptions_for_gated_repo": False,
625
- "_raise_exceptions_for_missing_entries": False,
626
- "_commit_hash": commit_hash,
627
- **has_file_kwargs,
628
- }
629
- can_auto_convert = (
630
- not is_offline_mode() # for obvious reasons
631
- # If we are in a CI environment or in a pytest run, we prevent the conversion
632
- and not is_env_variable_true("DISABLE_SAFETENSORS_CONVERSION")
633
- and not is_remote_code # converter bot does not work on remote code
634
- and subfolder == "" # converter bot does not work on subfolders
635
- )
636
-
637
517
  try:
638
518
  # Load from URL or cache if already cached
639
- # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
640
- # result when internet is up, the repo and revision exist, but the file does not.
519
+ cached_file_kwargs = {
520
+ "cache_dir": cache_dir,
521
+ "force_download": force_download,
522
+ "proxies": proxies,
523
+ "local_files_only": local_files_only,
524
+ "token": token,
525
+ "user_agent": user_agent,
526
+ "revision": revision,
527
+ "subfolder": subfolder,
528
+ "_raise_exceptions_for_gated_repo": False,
529
+ "_raise_exceptions_for_missing_entries": False,
530
+ "_commit_hash": commit_hash,
531
+ }
641
532
  resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
642
533
 
643
- # Try safetensors files first if not already found
534
+ # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
535
+ # result when internet is up, the repo and revision exist, but the file does not.
644
536
  if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant):
645
537
  # Maybe the checkpoint is sharded, we try to grab the index name in this case.
646
538
  resolved_archive_file = cached_file(
@@ -651,7 +543,7 @@ def _get_resolved_checkpoint_files(
651
543
  if resolved_archive_file is not None:
652
544
  is_sharded = True
653
545
  elif use_safetensors:
654
- if revision == "main" and can_auto_convert:
546
+ if revision == "main" and not is_offline_mode():
655
547
  resolved_archive_file, revision, is_sharded = auto_conversion(
656
548
  pretrained_model_name_or_path, **cached_file_kwargs
657
549
  )
@@ -660,7 +552,8 @@ def _get_resolved_checkpoint_files(
660
552
  raise OSError(
661
553
  f"{pretrained_model_name_or_path} does not appear to have a file named"
662
554
  f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} "
663
- "and thus cannot be loaded with `safetensors`. Please do not set `use_safetensors=True`."
555
+ "and thus cannot be loaded with `safetensors`. Please make sure that the model has "
556
+ "been saved with `safe_serialization=True` or do not set `use_safetensors=True`."
664
557
  )
665
558
  else:
666
559
  # This repo has no safetensors file of any kind, we switch to PyTorch.
@@ -668,8 +561,6 @@ def _get_resolved_checkpoint_files(
668
561
  resolved_archive_file = cached_file(
669
562
  pretrained_model_name_or_path, filename, **cached_file_kwargs
670
563
  )
671
-
672
- # Then try `.bin` files
673
564
  if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant):
674
565
  # Maybe the checkpoint is sharded, we try to grab the index name in this case.
675
566
  resolved_archive_file = cached_file(
@@ -679,38 +570,67 @@ def _get_resolved_checkpoint_files(
679
570
  )
680
571
  if resolved_archive_file is not None:
681
572
  is_sharded = True
682
-
683
- # If we have a match, but it's `.bin` format, try to launch safetensors conversion for next time
684
- if resolved_archive_file is not None:
685
- safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME
686
- if (
687
- filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]
688
- and not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs)
689
- and can_auto_convert
690
- ):
691
- Thread(
692
- target=auto_conversion,
693
- args=(pretrained_model_name_or_path,),
694
- kwargs={"ignore_errors_during_conversion": False, **cached_file_kwargs},
695
- name="Thread-auto_conversion",
696
- ).start()
697
-
698
- # If no match, raise appropriate errors
699
- else:
700
- # Otherwise, no PyTorch file was found
701
- if variant is not None and has_file(
702
- pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
703
- ):
704
- raise OSError(
705
- f"{pretrained_model_name_or_path} does not appear to have a file named"
706
- f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant"
707
- f" {variant}. Use `variant=None` to load this model from those weights."
708
- )
573
+ if not local_files_only and not is_offline_mode():
574
+ if resolved_archive_file is not None:
575
+ # In a CI environment (CircleCI / GitHub Actions workflow runs) or in a pytest run,
576
+ # we set `DISABLE_SAFETENSORS_CONVERSION=true` to prevent the conversion.
577
+ if (
578
+ filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]
579
+ and os.getenv("DISABLE_SAFETENSORS_CONVERSION", None) != "true"
580
+ ):
581
+ # If the PyTorch file was found, check if there is a safetensors file on the repository
582
+ # If there is no safetensors file on the repositories, start an auto conversion
583
+ safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME
584
+ has_file_kwargs = {
585
+ "revision": revision,
586
+ "proxies": proxies,
587
+ "token": token,
588
+ "cache_dir": cache_dir,
589
+ "local_files_only": local_files_only,
590
+ }
591
+ cached_file_kwargs = {
592
+ "cache_dir": cache_dir,
593
+ "force_download": force_download,
594
+ "local_files_only": local_files_only,
595
+ "user_agent": user_agent,
596
+ "subfolder": subfolder,
597
+ "_raise_exceptions_for_gated_repo": False,
598
+ "_raise_exceptions_for_missing_entries": False,
599
+ "_commit_hash": commit_hash,
600
+ **has_file_kwargs,
601
+ }
602
+ if (
603
+ not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs)
604
+ and not is_remote_code
605
+ ):
606
+ Thread(
607
+ target=auto_conversion,
608
+ args=(pretrained_model_name_or_path,),
609
+ kwargs={"ignore_errors_during_conversion": True, **cached_file_kwargs},
610
+ name="Thread-auto_conversion",
611
+ ).start()
709
612
  else:
710
- raise OSError(
711
- f"{pretrained_model_name_or_path} does not appear to have a file named"
712
- f" {_add_variant(WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_NAME, variant)}."
713
- )
613
+ # Otherwise, no PyTorch file was found
614
+ has_file_kwargs = {
615
+ "revision": revision,
616
+ "proxies": proxies,
617
+ "token": token,
618
+ "cache_dir": cache_dir,
619
+ "local_files_only": local_files_only,
620
+ }
621
+ if variant is not None and has_file(
622
+ pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
623
+ ):
624
+ raise OSError(
625
+ f"{pretrained_model_name_or_path} does not appear to have a file named"
626
+ f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant"
627
+ f" {variant}. Use `variant=None` to load this model from those weights."
628
+ )
629
+ else:
630
+ raise OSError(
631
+ f"{pretrained_model_name_or_path} does not appear to have a file named"
632
+ f" {_add_variant(WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_NAME, variant)}."
633
+ )
714
634
 
715
635
  except OSError:
716
636
  # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted
@@ -777,20 +697,22 @@ def _get_resolved_checkpoint_files(
777
697
 
778
698
 
779
699
  def _get_dtype(
780
- dtype: str | torch.dtype | dict | None,
781
- checkpoint_files: list[str] | None,
700
+ cls,
701
+ dtype: Optional[Union[str, torch.dtype, dict]],
702
+ checkpoint_files: Optional[list[str]],
782
703
  config: PreTrainedConfig,
783
- sharded_metadata: dict | None,
784
- state_dict: dict | None,
704
+ sharded_metadata: Optional[dict],
705
+ state_dict: Optional[dict],
785
706
  weights_only: bool,
786
- hf_quantizer: HfQuantizer | None = None,
787
- ) -> tuple[PreTrainedConfig, torch.dtype]:
707
+ ) -> tuple[PreTrainedConfig, Optional[torch.dtype], Optional[torch.dtype]]:
788
708
  """Find the correct `dtype` to use based on provided arguments. Also update the `config` based on the
789
709
  inferred dtype. We do the following:
790
- 1. If dtype is "auto", we try to read the config, else auto-detect dtype from the loaded state_dict, by checking
791
- its first weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
792
- 2. Else, use the dtype provided as a dict or str
710
+ 1. If dtype is not None, we use that dtype
711
+ 2. If dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first
712
+ weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
713
+ We may also have config.dtype available, but we won't rely on it until v5.
793
714
  """
715
+ dtype_orig = None
794
716
  is_sharded = sharded_metadata is not None
795
717
 
796
718
  if dtype is not None:
@@ -815,46 +737,43 @@ def _get_dtype(
815
737
  )
816
738
  elif hasattr(torch, dtype):
817
739
  dtype = getattr(torch, dtype)
818
- else:
819
- raise ValueError(
820
- "`dtype` provided as a `str` can only be `'auto'`, or a string representation of a valid `torch.dtype`"
821
- )
822
-
823
- # cast it to a proper `torch.dtype` object
824
- dtype = getattr(torch, dtype) if isinstance(dtype, str) else dtype
825
- elif not isinstance(dtype, (dict, torch.dtype)):
740
+ config.dtype = dtype
741
+ for sub_config_key in config.sub_configs:
742
+ if (sub_config := getattr(config, sub_config_key)) is not None:
743
+ sub_config.dtype = dtype
744
+ elif isinstance(dtype, torch.dtype):
745
+ config.dtype = dtype
746
+ for sub_config_key in config.sub_configs:
747
+ if (sub_config := getattr(config, sub_config_key)) is not None:
748
+ sub_config.dtype = dtype
749
+ elif isinstance(dtype, dict):
750
+ for key, curr_dtype in dtype.items():
751
+ if hasattr(config, key):
752
+ value = getattr(config, key)
753
+ curr_dtype = curr_dtype if not isinstance(curr_dtype, str) else getattr(torch, curr_dtype)
754
+ value.dtype = curr_dtype
755
+ # main torch dtype for modules that aren't part of any sub-config
756
+ dtype = dtype.get("")
757
+ dtype = dtype if not isinstance(dtype, str) else getattr(torch, dtype)
758
+ config.dtype = dtype
759
+ if dtype is None:
760
+ dtype = torch.float32
761
+ else:
826
762
  raise ValueError(
827
763
  f"`dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `dtype` "
828
764
  f"for each sub-config in composite configs, but received {dtype}"
829
765
  )
830
- else:
831
- # set torch.get_default_dtype() (usually fp32) as the default dtype if `None` is provided
832
- dtype = torch.get_default_dtype()
833
-
834
- if hf_quantizer is not None:
835
- hf_quantizer.update_dtype(dtype)
836
-
837
- # Get the main dtype
838
- if isinstance(dtype, dict):
839
- main_dtype = dtype.get("", torch.get_default_dtype())
840
- main_dtype = getattr(torch, main_dtype) if isinstance(main_dtype, str) else main_dtype
841
-
842
- logger.warning_once(
843
- "Using different dtypes per module is deprecated and will be removed in future versions "
844
- "Setting different dtypes per backbone model might cause device errors downstream, therefore "
845
- f"setting the dtype={main_dtype} for all modules."
846
- )
847
766
 
767
+ dtype_orig = cls._set_default_dtype(dtype)
848
768
  else:
849
- main_dtype = dtype
850
-
851
- # Set it on the config and subconfigs
852
- config.dtype = main_dtype
853
- for sub_config_key in config.sub_configs:
854
- if (sub_config := getattr(config, sub_config_key)) is not None:
855
- sub_config.dtype = main_dtype
769
+ # set fp32 as the default dtype for BC
770
+ default_dtype = torch.get_default_dtype()
771
+ config.dtype = default_dtype
772
+ for key in config.sub_configs:
773
+ if (sub_config := getattr(config, key)) is not None:
774
+ sub_config.dtype = default_dtype
856
775
 
857
- return config, main_dtype
776
+ return config, dtype, dtype_orig
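# Hedged sketch of the dtype options resolved above (the checkpoint name is an assumption):
# model = AutoModelForCausalLM.from_pretrained("org/checkpoint", dtype="auto")           # infer from the weights
# model = AutoModelForCausalLM.from_pretrained("org/checkpoint", dtype=torch.bfloat16)   # explicit dtype
# model = AutoModelForCausalLM.from_pretrained(
#     "org/checkpoint", dtype={"text_config": torch.bfloat16, "": torch.float32}
# )  # per sub-config dict, "" being the fallback for modules outside any sub-config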
858
777
 
859
778
 
860
779
  class PipelineParallel(Enum):
@@ -905,8 +824,13 @@ class ModuleUtilsMixin:
905
824
  return encoder_extended_attention_mask
906
825
 
907
826
  @staticmethod
908
- def create_extended_attention_mask_for_decoder(input_shape, attention_mask):
909
- device = attention_mask.device
827
+ def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None):
828
+ if device is not None:
829
+ warnings.warn(
830
+ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
831
+ )
832
+ else:
833
+ device = attention_mask.device
910
834
  batch_size, seq_length = input_shape
911
835
  seq_ids = torch.arange(seq_length, device=device)
912
836
  causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
@@ -930,7 +854,8 @@ class ModuleUtilsMixin:
930
854
  self,
931
855
  attention_mask: Tensor,
932
856
  input_shape: tuple[int, ...],
933
- dtype: torch.dtype | None = None,
857
+ device: Optional[torch.device] = None,
858
+ dtype: Optional[torch.dtype] = None,
934
859
  ) -> Tensor:
935
860
  """
936
861
  Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
@@ -947,6 +872,12 @@ class ModuleUtilsMixin:
947
872
  if dtype is None:
948
873
  dtype = self.dtype
949
874
 
875
+ if not (attention_mask.dim() == 2 and self.config.is_decoder):
876
+ # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
877
+ if device is not None:
878
+ warnings.warn(
879
+ "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
880
+ )
950
881
  # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
951
882
  # ourselves in which case we just need to make it broadcastable to all heads.
952
883
  if attention_mask.dim() == 3:
@@ -955,9 +886,9 @@ class ModuleUtilsMixin:
955
886
  # Provided a padding mask of dimensions [batch_size, seq_length]
956
887
  # - if the model is a decoder, apply a causal mask in addition to the padding mask
957
888
  # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
958
- if getattr(self.config, "is_decoder", None):
889
+ if self.config.is_decoder:
959
890
  extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
960
- input_shape, attention_mask
891
+ input_shape, attention_mask, device
961
892
  )
962
893
  else:
963
894
  extended_attention_mask = attention_mask[:, None, None, :]
@@ -1038,52 +969,54 @@ class EmbeddingAccessMixin:
1038
969
  `nn.Module`: A torch module mapping vocabulary to hidden states.
1039
970
  """
1040
971
 
972
+ # 1) Check if the model has an attribute named 'embed_tokens' (the standard input embedding layer
973
+ # for most NLP models), and if so, return it.
974
+
1041
975
  name = getattr(self, "_input_embed_layer", "embed_tokens")
1042
976
 
1043
- # 1) Direct attribute (most NLP models).
1044
977
  if (default_embedding := getattr(self, name, None)) is not None:
1045
978
  return default_embedding
1046
- # 2) Nested embeddings (e.g., self.embeddings.patch_embedding for vision/audio models).
1047
- if hasattr(self, "embeddings") and hasattr(self.embeddings, name):
1048
- return getattr(self.embeddings, name)
1049
- # 3) Encoder/decoder wrappers (e.g., `self.model.embed_tokens` or similar overrides).
1050
- if hasattr(self, "model") and hasattr(self.model, name):
1051
- return getattr(self.model, name)
979
+ # 2) encoder/decoder and VLMs like `Gemma3nForConditionalGeneration`
1052
980
 
1053
- if hasattr(self, "base_model"):
1054
- base_model = self.base_model
1055
- if base_model is not None and base_model is not self:
1056
- return base_model.get_input_embeddings()
981
+ if hasattr(self, "model") and hasattr(self.model, "embed_tokens"):
982
+ return self.model.embed_tokens
1057
983
 
1058
- raise NotImplementedError(
1059
- f"`get_input_embeddings` not auto‑handled for {self.__class__.__name__}; please override in the subclass."
1060
- )
984
+ # 3) vanilla decoder‑only architectures
985
+ elif hasattr(self, "embed_tokens"):
986
+ return self.embed_tokens
987
+ else:
988
+ base_model = getattr(self, "base_model_prefix", None)
989
+ if base_model is not None:
990
+ base_model = getattr(self, base_model, None)
991
+ if base_model is not None and base_model is not self:
992
+ return base_model.get_input_embeddings()
993
+ raise NotImplementedError(
994
+ f"`get_input_embeddings` not auto‑handled for {self.__class__.__name__}; "
995
+ "please override in the subclass."
996
+ )
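# Hedged illustration of the fallback order above: a made-up decoder-only model that simply defines
# `self.embed_tokens` is resolved by the direct-attribute check, with no override needed.
# class TinyDecoder(PreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
#
# TinyDecoder(config).get_input_embeddings()  # -> the nn.Embedding above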
1061
997
 
1062
998
  def set_input_embeddings(self, value: nn.Module):
1063
999
  """Fallback setter that handles **~70%** of models in the code-base.
1064
1000
 
1065
1001
  Order of attempts:
1066
- 1. `self.<_input_embed_layer>` (direct attribute)
1067
- 2. `self.embeddings.<_input_embed_layer>` (nested embeddings for vision/audio models)
1068
- 3. `self.model.<_input_embed_layer>` (encoder/decoder models)
1069
- 4. delegate to the *base model* if one exists
1070
- 5. otherwise raise `NotImplementedError` so subclasses still can (and
1002
+ 1. `self.model.embed_tokens`
1003
+ 2. `self.embed_tokens`
1004
+ 3. delegate to the *base model* if one exists
1005
+ 4. otherwise raise `NotImplementedError` so subclasses still can (and
1071
1006
  should) override for exotic layouts.
1072
1007
  """
1073
1008
 
1009
+ # 1) encoder/decoder and VLMs like `Gemma3nForConditionalGeneration`
1074
1010
  name = getattr(self, "_input_embed_layer", "embed_tokens")
1075
- # 1) Direct attribute (most NLP models)
1076
- if hasattr(self, name):
1077
- setattr(self, name, value)
1078
- # 2) Nested embeddings (e.g., self.embeddings.patch_embedding for vision models)
1079
- elif hasattr(self, "embeddings") and hasattr(self.embeddings, name):
1080
- setattr(self.embeddings, name, value)
1081
- # 3) encoder/decoder and VLMs like `Gemma3nForConditionalGeneration`
1082
- elif hasattr(self, "model") and hasattr(self.model, name):
1011
+ if hasattr(self, "model") and hasattr(self.model, name):
1083
1012
  setattr(self.model, name, value)
1084
- # 4) recurse once into the registered *base* model (e.g. for encoder/decoder)
1085
- elif hasattr(self, "base_model") and self.base_model is not self:
1086
- self.base_model.set_input_embeddings(value)
1013
+ # 2) as well as vanilla decoder‑only architectures
1014
+ elif hasattr(self, name):
1015
+ setattr(self, name, value)
1016
+ # 3) recurse once into the registered *base* model (e.g. for encoder/decoder)
1017
+ elif getattr(self, self.base_model_prefix, self) is not self:
1018
+ base_model = getattr(self, self.base_model_prefix, self)
1019
+ base_model.set_input_embeddings(value)
1087
1020
  else:
1088
1021
  raise NotImplementedError(
1089
1022
  f"`set_input_embeddings` not auto‑handled for {self.__class__.__name__}; please override in the subclass."
@@ -1144,7 +1077,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1144
1077
  # to also prevent bfloat16 casting, use the _keep_in_fp32_modules_strict flag
1145
1078
  _keep_in_fp32_modules_strict = None
1146
1079
 
1147
- dtype_plan: dict[str, torch.dtype] | None = None
1080
+ dtype_plan: Optional[dict[str, torch.dtype]] = None
1148
1081
 
1149
1082
  # a list of `re` patterns of `state_dict` keys that should be removed from the list of missing
1150
1083
  # keys we find (keys inside the model but not in the checkpoint) and avoid unnecessary warnings.
@@ -1204,7 +1137,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1204
1137
 
1205
1138
  # Attributes used mainly in multimodal LLMs, though all models contain a valid field for these
1206
1139
  # Possible values are: text, image, video, audio and time
1207
- input_modalities: str | list[str] = "text" # most models are text
1140
+ input_modalities: Union[str, list[str]] = "text" # most models are text
1208
1141
 
1209
1142
  @property
1210
1143
  @torch._dynamo.allow_in_graph
@@ -1295,11 +1228,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1295
1228
  self.config._attn_implementation_internal = self._check_and_adjust_attn_implementation(
1296
1229
  self.config._attn_implementation, is_init_check=True
1297
1230
  )
1298
- # Check the experts implementation is supported, or set it if not yet set (on the internal attr, to avoid
1299
- # setting it recursively)
1300
- self.config._experts_implementation_internal = self._check_and_adjust_experts_implementation(
1301
- self.config._experts_implementation
1302
- )
1303
1231
  if self.can_generate():
1304
1232
  self.generation_config = GenerationConfig.from_model_config(config)
1305
1233
 
@@ -1415,7 +1343,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1415
1343
  def pp_plan(self, plan: dict[str, tuple[str, str]]):
1416
1344
  self._pp_plan = plan
1417
1345
 
1418
- def dequantize(self, dtype=None):
1346
+ def dequantize(self):
1419
1347
  """
1420
1348
  Potentially dequantize the model in case it has been quantized by a quantization method that support
1421
1349
  dequantization.
@@ -1425,7 +1353,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1425
1353
  if hf_quantizer is None:
1426
1354
  raise ValueError("You need to first quantize your model in order to dequantize it")
1427
1355
 
1428
- return hf_quantizer.dequantize(self, dtype=dtype)
1356
+ return hf_quantizer.dequantize(self)
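# Sketch (checkpoint and quantization config are assumptions): only quantizers that implement
# dequantization support this call.
# model = AutoModelForCausalLM.from_pretrained("org/quantized-checkpoint", quantization_config=...)
# model = model.dequantize()  # back to a plain floating-point model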
1429
1357
 
1430
1358
  def _backward_compatibility_gradient_checkpointing(self):
1431
1359
  if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
@@ -1433,7 +1361,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1433
1361
  # Remove the attribute now that is has been consumed, so it's no saved in the config.
1434
1362
  delattr(self.config, "gradient_checkpointing")
1435
1363
 
1436
- def add_model_tags(self, tags: list[str] | str) -> None:
1364
+ def add_model_tags(self, tags: Union[list[str], str]) -> None:
1437
1365
  r"""
1438
1366
  Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
1439
1367
  not overwrite existing tags in the model.
@@ -1466,6 +1394,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1466
1394
  self.model_tags.append(tag)
1467
1395
 
1468
1396
  @classmethod
1397
+ @restore_default_dtype
1469
1398
  def _from_config(cls, config, **kwargs):
1470
1399
  """
1471
1400
  All context managers that the model should be initialized under go here.
@@ -1474,6 +1403,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1474
1403
  dtype (`torch.dtype`, *optional*):
1475
1404
  Override the default `dtype` and load the model under this dtype.
1476
1405
  """
1406
+ # when we init a model from within another model (e.g. VLMs) and dispatch on FA2
1407
+ # a warning is raised that dtype should be fp16. Since we never pass dtype from within
1408
+ # modeling code, we can try to infer it here same way as done in `from_pretrained`
1477
1409
  # For BC on the old `torch_dtype`
1478
1410
  dtype = kwargs.pop("dtype", config.dtype)
1479
1411
  if (torch_dtype := kwargs.pop("torch_dtype", None)) is not None:
@@ -1483,32 +1415,61 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1483
1415
  if isinstance(dtype, str):
1484
1416
  dtype = getattr(torch, dtype)
1485
1417
 
1418
+ # override default dtype if needed
1419
+ dtype_orig = None
1420
+ if dtype is not None:
1421
+ dtype_orig = cls._set_default_dtype(dtype)
1422
+
1486
1423
  # If passing `attn_implementation` as kwargs, respect it (it will be applied recursively on subconfigs)
1487
1424
  if "attn_implementation" in kwargs:
1488
1425
  config._attn_implementation = kwargs.pop("attn_implementation")
1489
1426
 
1490
- # If passing `experts_implementation` as kwargs, respect it (it will be applied recursively on subconfigs)
1491
- if "experts_implementation" in kwargs:
1492
- config._experts_implementation = kwargs.pop("experts_implementation")
1493
-
1494
- init_contexts = []
1495
- if dtype is not None:
1496
- init_contexts.append(local_torch_dtype(dtype, cls.__name__))
1497
-
1498
1427
  if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
1499
1428
  logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
1500
1429
  # this immediately partitions the model across all gpus, to avoid the overhead in time
1501
1430
  # and memory copying it on CPU or each GPU first
1502
1431
  import deepspeed
1503
1432
 
1504
- init_contexts.extend([deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()])
1433
+ init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()]
1434
+ with ContextManagers(init_contexts):
1435
+ model = cls(config, **kwargs)
1505
1436
 
1506
- # Instantiate the model
1507
- with ContextManagers(init_contexts):
1437
+ else:
1508
1438
  model = cls(config, **kwargs)
1509
1439
 
1440
+ # restore default dtype if it was modified
1441
+ if dtype_orig is not None:
1442
+ torch.set_default_dtype(dtype_orig)
1443
+
1510
1444
  return model
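# Illustrative usage through the public entry point (model and config names are assumptions):
# config = AutoConfig.from_pretrained("org/checkpoint")
# model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16, attn_implementation="sdpa")
# # parameters are allocated in bf16 because the default dtype is switched for the duration of __init__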
1511
1445
 
1446
+ @classmethod
1447
+ def _set_default_dtype(cls, dtype: torch.dtype) -> torch.dtype:
1448
+ """
1449
+ Change the default dtype and return the previous one. This is needed when wanting to instantiate the model
1450
+ under specific dtype.
1451
+
1452
+ Args:
1453
+ dtype (`torch.dtype`):
1454
+ a floating dtype to set to.
1455
+
1456
+ Returns:
1457
+ `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was
1458
+ modified. If it wasn't, returns `None`.
1459
+
1460
+ Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
1461
+ `torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
1462
+ """
1463
+ if not dtype.is_floating_point:
1464
+ raise ValueError(
1465
+ f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype"
1466
+ )
1467
+
1468
+ logger.info(f"Instantiating {cls.__name__} model under default dtype {dtype}.")
1469
+ dtype_orig = torch.get_default_dtype()
1470
+ torch.set_default_dtype(dtype)
1471
+ return dtype_orig
1472
+
1512
1473
  @property
1513
1474
  def base_model(self) -> nn.Module:
1514
1475
  """
@@ -1585,9 +1546,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1585
1546
  return True
1586
1547
 
1587
1548
  if is_torch_xpu_available():
1588
- logger.info(
1589
- f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU."
1590
- )
1549
+ logger.info("Detect using FlashAttention2 (via kernel `kernels-community/flash-attn2`) on XPU.")
1591
1550
  return True
1592
1551
 
1593
1552
  if importlib.util.find_spec("flash_attn") is None:
@@ -1756,22 +1715,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1756
1715
 
1757
1716
  return True
1758
1717
 
1759
- def _grouped_mm_can_dispatch(self) -> bool:
1760
- """
1761
- Check the availability of Grouped MM for a given model.
1762
- """
1763
-
1764
- if not self._can_set_experts_implementation():
1765
- raise ValueError(f"{self.__class__.__name__} does not support setting experts implementation.")
1766
-
1767
- if not is_grouped_mm_available():
1768
- raise ImportError(
1769
- "PyTorch Grouped MM requirements in Transformers are not met. Please install torch>=2.9.0."
1770
- )
1771
-
1772
- # If no error raised by this point, we can return `True`
1773
- return True
1774
-
1775
1718
  def _flex_attn_can_dispatch(self, is_init_check: bool = False) -> bool:
1776
1719
  """
1777
1720
  Check the availability of Flex Attention for a given model.
@@ -1800,7 +1743,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1800
1743
  return True
1801
1744
 
1802
1745
  def _check_and_adjust_attn_implementation(
1803
- self, attn_implementation: str | None, is_init_check: bool = False
1746
+ self, attn_implementation: Optional[str], is_init_check: bool = False
1804
1747
  ) -> str:
1805
1748
  """
1806
1749
  Check that the `attn_implementation` exists and is supported by the models, and try to get the kernel from hub if
@@ -1821,12 +1764,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1821
1764
  """
1822
1765
  applicable_attn_implementation = attn_implementation
1823
1766
 
1824
- is_paged = attn_implementation is not None and attn_implementation.startswith("paged|")
1825
-
1826
1767
  # If FA not installed, do not fail but use kernels instead
1827
1768
  requested_original_flash_attn = attn_implementation is not None and (
1828
- attn_implementation.removeprefix("paged|") == "flash_attention_2"
1829
- or attn_implementation.removeprefix("paged|") == "flash_attention_3"
1769
+ attn_implementation == "flash_attention_2" or attn_implementation == "flash_attention_3"
1830
1770
  )
1831
1771
  if (
1832
1772
  requested_original_flash_attn
@@ -1835,23 +1775,19 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1835
1775
  and is_kernels_available()
1836
1776
  and not is_torch_npu_available()
1837
1777
  ):
1838
- applicable_attn_implementation = FLASH_ATTN_KERNEL_FALLBACK[attn_implementation.removeprefix("paged|")]
1839
-
1840
- if is_torch_xpu_available() and attn_implementation.removeprefix("paged|") == "flash_attention_2":
1841
- # On XPU, kernels library is the native implementation
1842
- # Disabling this flag to avoid giving wrong fallbacks on errors and warnings
1843
- requested_original_flash_attn = False
1844
-
1845
- if is_paged:
1846
- applicable_attn_implementation = f"paged|{applicable_attn_implementation}"
1778
+ if attn_implementation.endswith("2"):
1779
+ applicable_attn_implementation = "kernels-community/flash-attn2"
1780
+ if is_torch_xpu_available():
1781
+ # On XPU, kernels library is the native implementation
1782
+ # Disabling this flag to avoid giving wrong fallbacks on errors and warnings
1783
+ requested_original_flash_attn = False
1784
+ else:
1785
+ applicable_attn_implementation = "kernels-community/vllm-flash-attn3"
1847
1786
 
1848
1787
  if is_kernel(applicable_attn_implementation):
1849
1788
  try:
1850
1789
  # preload flash attention here to allow compile with fullgraph
1851
- if is_paged:
1852
- lazy_import_paged_flash_attention(applicable_attn_implementation)
1853
- else:
1854
- lazy_import_flash_attention(applicable_attn_implementation)
1790
+ lazy_import_flash_attention(applicable_attn_implementation)
1855
1791
 
1856
1792
  # log that we used kernel fallback if successful
1857
1793
  if requested_original_flash_attn:
@@ -1875,25 +1811,12 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1875
1811
  )
1876
1812
 
1877
1813
  # preload flash attention here to allow compile with fullgraph
1878
- if is_flash_attention_requested(requested_attention_implementation=applicable_attn_implementation):
1814
+ if "flash" in applicable_attn_implementation:
1879
1815
  lazy_import_flash_attention(applicable_attn_implementation)
1880
1816
 
1881
1817
  return applicable_attn_implementation
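# Sketch of the kernel fallback described above (checkpoint name is an assumption): requesting flash
# attention without the `flash_attn` package installed resolves to a hub kernel when `kernels` is available.
# model = AutoModelForCausalLM.from_pretrained("org/checkpoint", attn_implementation="flash_attention_2")
# # -> mapped to the "kernels-community/flash-attn2" kernel if flash-attn itself is not importable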
1882
1818
 
1883
- def _check_and_adjust_experts_implementation(self, experts_implementation: str | None) -> str:
1884
- """
1885
- Check that the `experts_implementation` exists and is supported by the models.
1886
-
1887
- Args:
1888
- experts_implementation (`str` or `None`):
1889
- The experts implementation to check for existence/validity.
1890
- Returns:
1891
- `str`: The final experts implementation to use.
1892
- """
1893
- applicable_experts_implementation = self.get_correct_experts_implementation(experts_implementation)
1894
- return applicable_experts_implementation
1895
-
1896
- def get_correct_attn_implementation(self, requested_attention: str | None, is_init_check: bool = False) -> str:
1819
+ def get_correct_attn_implementation(self, requested_attention: Optional[str], is_init_check: bool = False) -> str:
1897
1820
  applicable_attention = "sdpa" if requested_attention is None else requested_attention
1898
1821
  if applicable_attention not in ["eager"] + ALL_ATTENTION_FUNCTIONS.valid_keys():
1899
1822
  message = (
@@ -1927,33 +1850,13 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1927
1850
 
1928
1851
  return applicable_attention
1929
1852
 
1930
- def get_correct_experts_implementation(self, requested_experts: str | None) -> str:
1931
- applicable_experts = "grouped_mm" if requested_experts is None else requested_experts
1932
- if applicable_experts not in ["eager", "grouped_mm", "batched_mm"]:
1933
- message = (
1934
- f'Specified `experts_implementation="{applicable_experts}"` is not supported. The only possible arguments are '
1935
- '`experts_implementation="eager"`, `"experts_implementation=grouped_mm"` and `"experts_implementation=batched_mm"`.'
1936
- )
1937
- raise ValueError(message)
1938
-
1939
- # Perform relevant checks
1940
- if applicable_experts == "grouped_mm":
1941
- try:
1942
- self._grouped_mm_can_dispatch()
1943
- except (ValueError, ImportError) as e:
1944
- if requested_experts == "grouped_mm":
1945
- raise e
1946
- applicable_experts = "eager"
1947
-
1948
- return applicable_experts
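# Sketch of the dispatch above (only meaningful for MoE models; the checkpoint name is an assumption):
# leaving `experts_implementation` unset resolves to "grouped_mm" with a silent fallback to "eager",
# while requesting "grouped_mm" explicitly raises if grouped MM is unavailable.
# model = AutoModelForCausalLM.from_pretrained("org/moe-checkpoint", experts_implementation="eager")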
1949
-
1950
1853
  @classmethod
1951
1854
  def _can_set_attn_implementation(cls) -> bool:
1952
1855
  """Detect whether the class supports setting its attention implementation dynamically. It is an ugly check based on
1953
1856
  opening the file, but avoids maintaining yet another property flag.
1954
1857
  """
1955
1858
  class_file = sys.modules[cls.__module__].__file__
1956
- with open(class_file, "r", encoding="utf-8") as f:
1859
+ with open(class_file, "r") as f:
1957
1860
  code = f.read()
1958
1861
  # heuristic -> if we find those patterns, the model uses the correct interface
1959
1862
  if re.search(r"class \w+Attention\(nn.Module\)", code):
@@ -1965,18 +1868,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
1965
1868
  # If no attention layer, assume `True`. Most probably a multimodal model or inherits from existing models
1966
1869
  return True
1967
1870
 
1968
- @classmethod
1969
- def _can_set_experts_implementation(cls) -> bool:
1970
- """Detect whether the class supports setting its experts implementation dynamically. It is an ugly check based on
1971
- opening the file, but avoids maintaining yet another property flag.
1972
- """
1973
- class_file = sys.modules[cls.__module__].__file__
1974
- with open(class_file, "r", encoding="utf-8") as f:
1975
- code = f.read()
1976
- # heuristic -> if we the use_experts_implementation decorator is used, then we can set it
1977
- return "@use_experts_implementation" in code
1978
-
1979
- def set_attn_implementation(self, attn_implementation: str | dict):
1871
+ def set_attn_implementation(self, attn_implementation: Union[str, dict]):
1980
1872
  """
1981
1873
  Set the requested `attn_implementation` for this model.
1982
1874
 
@@ -2075,50 +1967,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2075
1967
  if hasattr(subconfig, "_attn_was_changed"):
2076
1968
  del subconfig._attn_was_changed
2077
1969
 
2078
- def set_experts_implementation(self, experts_implementation: str | dict):
2079
- """
2080
- Set the requested `experts_implementation` for this model.
2081
-
2082
- Args:
2083
- experts_implementation (`str` or `dict`):
2084
- The experts implementation to set for this model. It can be either a `str`, in which case it will be
2085
- dispatched to all submodels if relevant, or a `dict` where keys are the sub_configs name, in which case each
2086
- submodel will dispatch the corresponding value.
2087
- """
2088
- requested_implementation = (
2089
- experts_implementation
2090
- if not isinstance(experts_implementation, dict)
2091
- else experts_implementation.get("", self.config._experts_implementation)
2092
- )
2093
-
2094
- if requested_implementation != self.config._experts_implementation:
2095
- requested_implementation = self._check_and_adjust_experts_implementation(requested_implementation)
2096
- # Apply the change (on the internal attr, to avoid setting it recursively)
2097
- self.config._experts_implementation_internal = requested_implementation
2098
-
2099
- # Apply it to all submodels as well
2100
- for submodule in self.modules():
2101
- # We found a submodel (which is not self) with a different config (otherwise, it may be the same "actual model",
2102
- # e.g. ForCausalLM has a Model inside, but no need to check it again)
2103
- if (
2104
- submodule is not self
2105
- and isinstance(submodule, PreTrainedModel)
2106
- and submodule.config.__class__ != self.config.__class__
2107
- ):
2108
- # Set the experts on the submodule
2109
- sub_implementation = requested_implementation
2110
- if isinstance(experts_implementation, dict):
2111
- for subconfig_key in self.config.sub_configs:
2112
- # We need to check for exact object match here, with `is`
2113
- if getattr(self.config, subconfig_key) is submodule.config:
2114
- sub_implementation = experts_implementation.get(
2115
- subconfig_key, submodule.config._experts_implementation
2116
- )
2117
- break
2118
- # Check that the submodule can use it correctly; raise an error if the requested experts implementation can't be set for the submodule
2119
- sub_implementation = submodule.get_correct_experts_implementation(sub_implementation)
2120
- submodule.config._experts_implementation_internal = sub_implementation
2121
-
2122
1970
  def enable_input_require_grads(self):
2123
1971
  """
2124
1972
  Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
@@ -2130,18 +1978,14 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2130
1978
 
2131
1979
  hooks = []
2132
1980
  seen_modules = set()
2133
- found_embeddings = False
2134
1981
 
2135
1982
  for module in self.modules():
2136
1983
  if not (isinstance(module, PreTrainedModel) and hasattr(module, "get_input_embeddings")):
2137
1984
  continue
2138
1985
 
2139
- try:
2140
- input_embeddings = module.get_input_embeddings()
2141
- except NotImplementedError:
2142
- continue
1986
+ input_embeddings = module.get_input_embeddings()
2143
1987
 
2144
- if input_embeddings is None or not hasattr(input_embeddings, "register_forward_hook"):
1988
+ if input_embeddings is None:
2145
1989
  continue
2146
1990
 
2147
1991
  embedding_id = id(input_embeddings)
@@ -2150,18 +1994,11 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2150
1994
 
2151
1995
  seen_modules.add(embedding_id)
2152
1996
  hooks.append(input_embeddings.register_forward_hook(make_inputs_require_grads))
2153
- found_embeddings = True
2154
1997
 
2155
1998
  self._require_grads_hooks = hooks
2156
1999
  if hooks:
2157
2000
  # for BC
2158
2001
  self._require_grads_hook = hooks[0]
2159
- if not found_embeddings:
2160
- logger.warning_once(
2161
- f"{self.__class__.__name__} does not expose input embeddings. Gradients cannot flow back to the token "
2162
- "embeddings when using adapters or gradient checkpointing. Override `get_input_embeddings` to fully "
2163
- "support those features, or set `_input_embed_layer` to the attribute name that holds the embeddings."
2164
- )
2165
2002
 
2166
2003
  def disable_input_require_grads(self):
2167
2004
  """
@@ -2178,7 +2015,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2178
2015
  if hasattr(self, "_require_grads_hook"):
2179
2016
  del self._require_grads_hook
2180
2017
 
2181
- def get_encoder(self, modality: str | None = None):
2018
+ def get_encoder(self, modality: Optional[str] = None):
2182
2019
  """
2183
2020
  Best-effort lookup of the *encoder* module. If provided with `modality` argument,
2184
2021
  it looks for a modality-specific encoder in multimodal models (e.g. "image_encoder")
@@ -2210,7 +2047,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2210
2047
  # If this is a base transformer model (no encoder/model attributes), return self
2211
2048
  return self
2212
2049
 
2213
- def set_encoder(self, encoder, modality: str | None = None):
2050
+ def set_encoder(self, encoder, modality: Optional[str] = None):
2214
2051
  """
2215
2052
  Symmetric setter. Mirrors the lookup logic used in `get_encoder`.
2216
2053
  """
@@ -2267,6 +2104,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2267
2104
  possible_module_names = ["language_model", "text_model", "decoder"]
2268
2105
  for name in possible_module_names:
2269
2106
  if hasattr(self, name):
2107
+ print(name)
2270
2108
  setattr(self, name, decoder)
2271
2109
  return
2272
2110
 
@@ -2296,13 +2134,14 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2296
2134
  if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose1d, nn.ConvTranspose2d)):
2297
2135
  if getattr(module, "weight", None) is not None:
2298
2136
  init.normal_(module.weight, mean=0.0, std=std)
2299
- if module.bias is not None:
2137
+ if getattr(module, "bias", None) is not None:
2300
2138
  init.zeros_(module.bias)
2301
2139
  elif isinstance(module, nn.Embedding):
2302
- init.normal_(module.weight, mean=0.0, std=std)
2303
- # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
2304
- if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
2305
- init.zeros_(module.weight[module.padding_idx])
2140
+ if getattr(module, "weight", None) is not None:
2141
+ init.normal_(module.weight, mean=0.0, std=std)
2142
+ # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
2143
+ if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
2144
+ init.zeros_(module.weight[module.padding_idx])
2306
2145
  elif isinstance(module, nn.MultiheadAttention):
2307
2146
  # This uses torch's original init
2308
2147
  module._reset_parameters()
@@ -2314,25 +2153,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2314
2153
  or "RMSNorm" in module.__class__.__name__
2315
2154
  ):
2316
2155
  # Norms can exist without weights (in which case they are None from torch primitives)
2317
- if getattr(module, "weight", None) is not None:
2156
+ if hasattr(module, "weight") and module.weight is not None:
2318
2157
  init.ones_(module.weight)
2319
- if getattr(module, "bias", None) is not None:
2158
+ if hasattr(module, "bias") and module.bias is not None:
2320
2159
  init.zeros_(module.bias)
2321
- # And the potential buffers for the BatchNorms
2322
- if getattr(module, "running_mean", None) is not None:
2323
- init.zeros_(module.running_mean)
2324
- init.ones_(module.running_var)
2325
- init.zeros_(module.num_batches_tracked)
2326
- # This matches all the usual RotaryEmbeddings modules
2327
- elif "RotaryEmbedding" in module.__class__.__name__ and hasattr(module, "original_inv_freq"):
2328
- rope_fn = (
2329
- ROPE_INIT_FUNCTIONS[module.rope_type]
2330
- if module.rope_type != "default"
2331
- else module.compute_default_rope_parameters
2332
- )
2333
- buffer_value, _ = rope_fn(module.config)
2334
- init.copy_(module.inv_freq, buffer_value)
2335
- init.copy_(module.original_inv_freq, buffer_value)
2336
2160
 
2337
2161
  def _initialize_weights(self, module):
2338
2162
  """
@@ -2437,10 +2261,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2437
2261
 
2438
2262
  tied_mapping = self._tied_weights_keys
2439
2263
  # If the config does not specify any tying, return empty dict
2440
- # NOTE: not all modules have `tie_word_embeddings` attr, for example vision-only
2441
- # modules do not have any word embeddings!
2442
- tie_word_embeddings = getattr(self.config, "tie_word_embeddings", False)
2443
- if not tie_word_embeddings:
2264
+ if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder:
2444
2265
  return {}
2445
2266
  # If None, return empty dict
2446
2267
  elif tied_mapping is None:
@@ -2486,7 +2307,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
         return expanded_tied_weights
 
-    def tie_weights(self, missing_keys: set[str] | None = None, recompute_mapping: bool = True):
+    def tie_weights(self, missing_keys: Optional[set[str]] = None, recompute_mapping: bool = True):
         """
         Tie the model weights. If `recompute_mapping=False` (default when called internally), it will rely on the
         `model.all_tied_weights_keys` attribute, containing the `{target: source}` mapping for the tied params.
@@ -2506,26 +2327,30 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2506
2327
 
2507
2328
  tied_keys = list(tied_keys.items())
2508
2329
  for i, (target_param_name, source_param_name) in enumerate(tied_keys):
2330
+ # Usually we tie a single target to a single source, but when both are missing we may later tie
2331
+ # both the source and target to a third "backup" parameter that is present in the checkpoint, so we use
2332
+ # a list here
2333
+ target_param_names = [target_param_name]
2334
+
2509
2335
  # This is `from_pretrained` -> let's check symmetrically in case the source key is not present
2510
2336
  if missing_keys is not None:
2511
2337
  remove_from_missing = True
2512
2338
  source_is_there = source_param_name not in missing_keys
2513
2339
  target_is_there = target_param_name not in missing_keys
2514
2340
  # Both are already present -> it means the config is wrong and do not reflect the actual
2515
- # checkpoint -> let's raise a warning and NOT tie them
2341
+ # checkpoint -> let's raise a warning and do nothing
2516
2342
  if source_is_there and target_is_there:
2517
2343
  logger.warning(
2518
2344
  f"The tied weights mapping and config for this model specifies to tie {source_param_name} to "
2519
2345
  f"{target_param_name}, but both are present in the checkpoints, so we will NOT tie them. "
2520
2346
  "You should update the config with `tie_word_embeddings=False` to silence this warning"
2521
2347
  )
2522
- # Remove from internal attribute to correctly reflect actual tied weights
2523
- self.all_tied_weights_keys.pop(target_param_name)
2524
2348
  # Skip to next iteration
2525
2349
  continue
2526
2350
  # We're missing the source but we have the target -> we swap them, tying the parameter that exists
2527
2351
  elif not source_is_there and target_is_there:
2528
2352
  target_param_name, source_param_name = source_param_name, target_param_name
2353
+ target_param_names = [target_param_name]
2529
2354
  # Both are missing -> check other keys in case more than 2 keys are tied to the same weight
2530
2355
  elif not source_is_there and not target_is_there:
2531
2356
  for target_backup, source_backup in tied_keys[i + 1 :]:
@@ -2534,10 +2359,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2534
2359
  if source_backup == source_param_name:
2535
2360
  target_backup_is_there = target_backup not in missing_keys
2536
2361
  # If the target is present, we found the correct weight to tie into (we know the source is missing)
2537
- # Note here that we do not tie the missing source right now as well, as it will be done anyway when
2538
- # the pair (target_backup, source_backup) becomes the main pair (target_param_name, source_param_name)
2539
2362
  if target_backup_is_there:
2540
2363
  source_param_name = target_backup
2364
+ # Append the source as well, since both are missing we'll tie both
2365
+ target_param_names.append(source_param_name)
2541
2366
  break
2542
2367
  # If we did not break from the loop, it was impossible to find a source key -> let's raise
2543
2368
  else:
@@ -2553,18 +2378,19 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
             # Perform the actual tying
             source_param = self.get_parameter_or_buffer(source_param_name)
-            if "." in target_param_name:
-                parent_name, name = target_param_name.rsplit(".", 1)
-                parent = self.get_submodule(parent_name)
-            else:
-                name = target_param_name
-                parent = self
-            # Tie the weights
-            setattr(parent, name, source_param)
-            self._adjust_bias(parent, source_param)
-            # Remove from missing if necessary
-            if missing_keys is not None and remove_from_missing:
-                missing_keys.discard(target_param_name)
+            for target_param_name in target_param_names:
+                if "." in target_param_name:
+                    parent_name, name = target_param_name.rsplit(".", 1)
+                    parent = self.get_submodule(parent_name)
+                else:
+                    name = target_param_name
+                    parent = self
+                # Tie the weights
+                setattr(parent, name, source_param)
+                self._adjust_bias(parent, source_param)
+                # Remove from missing if necessary
+                if missing_keys is not None and remove_from_missing:
+                    missing_keys.discard(target_param_name)
 
     def _adjust_bias(self, output_embeddings, input_embeddings):
         if getattr(output_embeddings, "bias", None) is not None and hasattr(output_embeddings, "weight"):
@@ -2609,8 +2435,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
     def resize_token_embeddings(
         self,
-        new_num_tokens: int | None = None,
-        pad_to_multiple_of: int | None = None,
+        new_num_tokens: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
         mean_resizing: bool = True,
     ) -> nn.Embedding:
         """
@@ -2690,7 +2516,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
         new_num_tokens = new_embeddings.weight.shape[0]
 
         # if word embeddings are not tied, make sure that lm head is resized as well
-        if self.get_output_embeddings() is not None:
+        if (
+            self.get_output_embeddings() is not None
+            and not self.config.get_text_config(decoder=True).tie_word_embeddings
+        ):
             old_lm_head = self.get_output_embeddings()
             if isinstance(old_lm_head, torch.nn.Embedding):
                 new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing)
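
In user code this path is reached through the public `resize_token_embeddings` helper, typically after adding tokens (checkpoint and token names illustrative):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    tokenizer.add_special_tokens({"additional_special_tokens": ["<tool>"]})
    # Grows the input embeddings (and the LM head when it is not tied);
    # pad_to_multiple_of rounds the new vocab size up for tensor-core friendliness.
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
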
@@ -2708,8 +2537,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2708
2537
  def _get_resized_embeddings(
2709
2538
  self,
2710
2539
  old_embeddings: nn.Embedding,
2711
- new_num_tokens: int | None = None,
2712
- pad_to_multiple_of: int | None = None,
2540
+ new_num_tokens: Optional[int] = None,
2541
+ pad_to_multiple_of: Optional[int] = None,
2713
2542
  mean_resizing: bool = True,
2714
2543
  ) -> nn.Embedding:
2715
2544
  """
@@ -2866,7 +2695,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
2866
2695
  def _get_resized_lm_head(
2867
2696
  self,
2868
2697
  old_lm_head: nn.Linear,
2869
- new_num_tokens: int | None = None,
2698
+ new_num_tokens: Optional[int] = None,
2870
2699
  transposed: bool = False,
2871
2700
  mean_resizing: bool = True,
2872
2701
  ) -> nn.Linear:
@@ -3063,7 +2892,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3063
2892
  f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`"
3064
2893
  )
3065
2894
 
3066
- def get_position_embeddings(self) -> nn.Embedding | tuple[nn.Embedding]:
2895
+ def get_position_embeddings(self) -> Union[nn.Embedding, tuple[nn.Embedding]]:
3067
2896
  raise NotImplementedError(
3068
2897
  f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
3069
2898
  f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`"
@@ -3074,8 +2903,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
         Maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
         initialization logic in `_init_weights`.
         """
-        # If we are initializing on meta device, there is no point in trying to run inits
-        if get_torch_context_manager_or_global_device() != torch.device("meta"):
+        if _init_weights:
             # Initialize weights
             self.initialize_weights()
             # Tie weights needs to be called here, but it can use the pre-computed `all_tied_weights_keys`
@@ -3096,7 +2924,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
             raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
 
         if gradient_checkpointing_kwargs is None:
-            gradient_checkpointing_kwargs = {"use_reentrant": False}
+            gradient_checkpointing_kwargs = {"use_reentrant": True}
 
         gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
 
@@ -3113,10 +2941,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                 "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model."
             )
 
-        needs_embedding_grads = self.main_input_name == "input_ids"
-        # we use that also to detect whether or not we have to raise if embeddings are missing (the submodel might not have embeddings at all)
-        enable_input_grads = needs_embedding_grads or getattr(self, "_hf_peft_config_loaded", False)
-        if enable_input_grads:
+        if getattr(self, "_hf_peft_config_loaded", False):
            # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
            # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
            # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate
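
Both branches above are driven from `gradient_checkpointing_enable`; passing the kwargs explicitly avoids relying on the version-dependent `use_reentrant` default seen in the previous hunk (checkpoint name illustrative):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    # When training adapters, also make sure gradients can reach the frozen embeddings:
    model.enable_input_require_grads()
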
@@ -3174,13 +2999,15 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
     def save_pretrained(
         self,
-        save_directory: str | os.PathLike,
+        save_directory: Union[str, os.PathLike],
         is_main_process: bool = True,
-        state_dict: dict | None = None,
+        state_dict: Optional[dict] = None,
+        save_function: Callable = torch.save,
         push_to_hub: bool = False,
-        max_shard_size: int | str = "50GB",
-        variant: str | None = None,
-        token: str | bool | None = None,
+        max_shard_size: Union[int, str] = "5GB",
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+        token: Optional[Union[str, bool]] = None,
         save_peft_format: bool = True,
         save_original_format: bool = True,
         **kwargs,
@@ -3200,13 +3027,18 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                 The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                 save parts of the model or if special precautions need to be taken when recovering the state dictionary
                 of a model (like when using model parallelism).
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+                needs to replace `torch.save` by another method.
             push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                 namespace).
-            max_shard_size (`int` or `str`, *optional*, defaults to `"50GB"`):
+            max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`):
                 The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                 lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
+                We default it to 5GB in order for models to be able to run easily on free-tier google colab instances
+                without CPU OOM issues.
 
                 <Tip warning={true}>
 
@@ -3215,8 +3047,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
                 </Tip>
 
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
             variant (`str`, *optional*):
-                If specified, weights are saved in the format model.<variant>.safetensors.
+                If specified, weights are saved in the format pytorch_model.<variant>.bin.
             token (`str` or `bool`, *optional*):
                 The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                 the token generated when running `hf auth login` (stored in `~/.huggingface`).
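
A typical call covering the documented arguments; the exact keyword set (`safe_serialization`, `save_function`) differs between the two versions in this diff, so treat this as a sketch against the rc0 signature (paths illustrative):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    model.save_pretrained(
        "./gpt2-local",
        max_shard_size="2GB",      # shard anything larger than 2GB
        safe_serialization=True,   # write .safetensors rather than pickle-based .bin
        variant="fp16",            # filenames gain a .fp16 suffix, per the docstring above
    )
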
@@ -3238,7 +3072,9 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3238
3072
 
3239
3073
  hf_quantizer = getattr(self, "hf_quantizer", None)
3240
3074
  quantization_serializable = (
3241
- hf_quantizer is not None and isinstance(hf_quantizer, HfQuantizer) and hf_quantizer.is_serializable()
3075
+ hf_quantizer is not None
3076
+ and isinstance(hf_quantizer, HfQuantizer)
3077
+ and hf_quantizer.is_serializable(safe_serialization=safe_serialization)
3242
3078
  )
3243
3079
 
3244
3080
  if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
@@ -3247,6 +3083,12 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                 " the logger on the traceback to understand the reason why the quantized model is not serializable."
             )
 
+        if "save_config" in kwargs:
+            warnings.warn(
+                "`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead."
+            )
+            is_main_process = kwargs.pop("save_config")
+
         # we need to check against tp_size, not tp_plan, as tp_plan is substituted to the class one
         if self._tp_size is not None and not is_huggingface_hub_greater_or_equal("0.31.4"):
             raise ImportError(
@@ -3268,7 +3110,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3268
3110
 
3269
3111
  metadata = {}
3270
3112
  if hf_quantizer is not None:
3271
- state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self)
3113
+ state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self, safe_serialization)
3272
3114
  metadata["format"] = "pt"
3273
3115
 
3274
3116
  # Only save the model itself if we are using distributed training
@@ -3321,22 +3163,28 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3321
3163
  current_peft_config = self.peft_config[active_adapter]
3322
3164
  current_peft_config.save_pretrained(save_directory)
3323
3165
 
3324
- # Get the model state_dict
3166
+ # for offloaded modules
3167
+ module_map = {}
3168
+
3169
+ # Save the model
3325
3170
  if state_dict is None:
3326
- state_dict = model_to_save.state_dict()
3171
+ # if any model parameters are offloaded, make module map
3172
+ if (
3173
+ hasattr(self, "hf_device_map")
3174
+ and len(set(self.hf_device_map.values())) > 1
3175
+ and ("cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values())
3176
+ ):
3177
+ warnings.warn(
3178
+ "Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)"
3179
+ )
3180
+ for name, module in model_to_save.named_modules():
3181
+ if name == "":
3182
+ continue
3183
+ module_state_dict = module.state_dict()
3327
3184
 
3328
- # if any model parameters are offloaded, we need to know it for later
3329
- is_offloaded = False
3330
- if (
3331
- hasattr(self, "hf_device_map")
3332
- and len(set(self.hf_device_map.values())) > 1
3333
- and ("cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values())
3334
- ):
3335
- is_offloaded = True
3336
- warnings.warn(
3337
- "Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory "
3338
- "exceeds the `shard_size` (50GB default)"
3339
- )
3185
+ for key in module_state_dict:
3186
+ module_map[name + f".{key}"] = module
3187
+ state_dict = model_to_save.state_dict()
3340
3188
 
3341
3189
  # Translate state_dict from smp to hf if saving with smp >= 1.10
3342
3190
  if IS_SAGEMAKER_MP_POST_1_10:
@@ -3354,19 +3202,86 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3354
3202
  if self._tp_size is not None:
3355
3203
  state_dict = replace_state_dict_local_with_dtensor(state_dict, self._tp_plan, self._device_mesh)
3356
3204
 
3357
- # Remove tied weights as safetensors do not handle them
3358
- state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save)
3205
+ if safe_serialization:
3206
+ # TODO: fix safe_serialization for tied weights
3207
+ # Safetensors does not allow tensor aliasing.
3208
+ # We're going to remove aliases before saving
3209
+ ptrs = collections.defaultdict(list)
3210
+ for name, tensor in state_dict.items():
3211
+ if not isinstance(tensor, torch.Tensor):
3212
+ # Sometimes in the state_dict we have non-tensor objects.
3213
+ # e.g. in bitsandbytes we have some `str` objects in the state_dict
3214
+ # In the non-tensor case, fall back to the pointer of the object itself
3215
+ ptrs[id(tensor)].append(name)
3216
+
3217
+ elif tensor.device.type == "meta":
3218
+ # In offloaded cases, there may be meta tensors in the state_dict.
3219
+ # For these cases, key by the pointer of the original tensor object
3220
+ # (state_dict tensors are detached and therefore no longer shared)
3221
+ tensor = self.get_parameter(name)
3222
+ ptrs[id(tensor)].append(name)
3223
+
3224
+ else:
3225
+ ptrs[id_tensor_storage(tensor)].append(name)
3226
+
3227
+ shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
3228
+
3229
+ # Recursively descend to find tied weight keys
3230
+ _tied_weights_keys = set(_get_tied_weight_keys(self))
3231
+ error_names = []
3232
+ to_delete_names = set()
3233
+ for names in shared_ptrs.values():
3234
+ # Removing the keys which are declared as known duplicates on
3235
+ # load. This allows to make sure the name which is kept is consistent.
3236
+ if _tied_weights_keys is not None:
3237
+ found = 0
3238
+ for name in sorted(names):
3239
+ matches_pattern = any(re.search(pat, name) for pat in _tied_weights_keys)
3240
+ if matches_pattern and name in state_dict:
3241
+ found += 1
3242
+ if found < len(names):
3243
+ to_delete_names.add(name)
3244
+ # We are entering a place where the weights and the transformers configuration do NOT match.
3245
+ shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
3246
+ # Those are actually tensor sharing but disjoint from each other, we can safely clone them
3247
+ # Reloaded won't have the same property, but it shouldn't matter in any meaningful way.
3248
+ for name in disjoint_names:
3249
+ state_dict[name] = state_dict[name].clone()
3250
+
3251
+ # When not all duplicates have been cleaned, still remove those keys, but put a clear warning.
3252
+ # If the link between tensors was done at runtime then `from_pretrained` will not get
3253
+ # the key back leading to random tensor. A proper warning will be shown
3254
+ # during reload (if applicable), but since the file is not necessarily compatible with
3255
+ # the config, better show a proper warning.
3256
+ shared_names, identical_names = _find_identical(shared_names, state_dict)
3257
+ # delete tensors that have identical storage
3258
+ for inames in identical_names:
3259
+ known = inames.intersection(to_delete_names)
3260
+ for name in known:
3261
+ del state_dict[name]
3262
+ unknown = inames.difference(to_delete_names)
3263
+ if len(unknown) > 1:
3264
+ error_names.append(unknown)
3265
+
3266
+ if shared_names:
3267
+ error_names.extend(shared_names)
3268
+
3269
+ if len(error_names) > 0:
3270
+ raise RuntimeError(
3271
+ f"The weights trying to be saved contained shared tensors {error_names} which are not properly defined. We found `_tied_weights_keys` to be: {_tied_weights_keys}.\n"
3272
+ "This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.",
3273
+ )
3359
3274
 
3360
3275
  # Revert all renaming and/or weight operations
3361
- if save_original_format and not _hf_peft_config_loaded:
3362
- state_dict = revert_weight_conversion(model_to_save, state_dict)
3276
+ if save_original_format:
3277
+ state_dict = revert_weight_conversion(self, state_dict)
3363
3278
 
3364
3279
  # Shard the model if it is too big.
3365
3280
  if not _hf_peft_config_loaded:
3366
- weights_name = SAFE_WEIGHTS_NAME
3281
+ weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
3367
3282
  weights_name = _add_variant(weights_name, variant)
3368
3283
  else:
3369
- weights_name = ADAPTER_SAFE_WEIGHTS_NAME
3284
+ weights_name = ADAPTER_SAFE_WEIGHTS_NAME if safe_serialization else ADAPTER_WEIGHTS_NAME
3370
3285
 
3371
3286
  filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
3372
3287
  state_dict_split = split_torch_state_dict_into_shards(
@@ -3399,45 +3314,57 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3399
3314
  and reg.fullmatch(filename_no_suffix) is not None
3400
3315
  ):
3401
3316
  os.remove(full_filename)
3402
-
3403
3317
  # Save the model
3404
- for shard_file, tensor_names in logging.tqdm(
3405
- state_dict_split.filename_to_tensors.items(), desc="Writing model shards"
3406
- ):
3407
- filename = os.path.join(save_directory, shard_file)
3408
- shard_state_dict = {}
3409
- for tensor_name in tensor_names:
3410
- # Get the tensor, and remove it from state_dict to avoid keeping the ref
3411
- tensor = state_dict.pop(tensor_name)
3412
-
3413
- # In case of TP, get the full parameter back
3414
- if _is_dtensor_available and isinstance(tensor, DTensor):
3415
- tensor = tensor.full_tensor()
3318
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
3319
+ if module_map:
3320
+ filename_to_tensors = logging.tqdm(filename_to_tensors, desc="Saving checkpoint shards")
3321
+ for shard_file, tensors in filename_to_tensors:
3322
+ shard = {}
3323
+ for tensor in tensors:
3324
+ if _is_dtensor_available and isinstance(state_dict[tensor], DTensor):
3325
+ full_tensor = state_dict[tensor].full_tensor()
3416
3326
  # to get the correctly ordered tensor we need to repack if packed
3417
- if _get_parameter_tp_plan(tensor_name, self._tp_plan) == "local_packed_rowwise":
3418
- tensor = repack_weights(tensor, -1, self._tp_size, 2)
3419
-
3420
- # If the param was offloaded, we need to load it back from disk to resave it. It's a strange pattern,
3421
- # but it would otherwise not be contained in the saved shard if we were to simply move the file
3422
- # or something
3423
- if is_offloaded and tensor.device.type == "meta":
3424
- tensor = load_offloaded_parameter(model_to_save, tensor_name)
3425
-
3426
- # only do contiguous after it's permuted correctly in case of TP
3427
- shard_state_dict[tensor_name] = tensor.contiguous()
3327
+ if _get_parameter_tp_plan(tensor, self._tp_plan) == "local_packed_rowwise":
3328
+ full_tensor = repack_weights(full_tensor, -1, self._tp_size, 2)
3329
+ shard[tensor] = full_tensor.contiguous() # only do contiguous after it's permuted correctly
3330
+ else:
3331
+ shard[tensor] = state_dict[tensor].contiguous()
3332
+ # delete reference, see https://github.com/huggingface/transformers/pull/34890
3333
+ del state_dict[tensor]
3334
+
3335
+ # remake shard with onloaded parameters if necessary
3336
+ if module_map:
3337
+ # init state_dict for this shard
3338
+ shard_state_dict = dict.fromkeys(shard, "")
3339
+ for module_name in shard:
3340
+ # note that get_state_dict_from_offload can update with meta tensors
3341
+ # if both a parent module and its descendant are offloaded
3342
+ tensor = shard_state_dict[module_name]
3343
+ if tensor == "" or (isinstance(tensor, torch.Tensor) and tensor.device.type == "meta"):
3344
+ # update state dict with onloaded parameters
3345
+ module = module_map[module_name]
3346
+ shard_state_dict = get_state_dict_from_offload(module, module_name, shard_state_dict)
3347
+
3348
+ # assign shard to be the completed state dict
3349
+ shard = shard_state_dict
3350
+ del shard_state_dict
3351
+ gc.collect()
3352
+
3353
+ if safe_serialization:
3354
+ # At some point we will need to deal better with save_function (used for TPU and other distributed
3355
+ # joyfulness), but for now this enough. # TODO: we should def parallelize this we are otherwise just waiting
3356
+ # too much before scheduling the next write when its in a different file
3357
+ safe_save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata)
3358
+ else:
3359
+ save_function(shard, os.path.join(save_directory, shard_file))
3428
3360
 
3429
- # TODO: it would be very nice to do the writing concurrently, but safetensors never releases the GIL,
3430
- # so it's not possible for now....
3431
- # Write the shard to disk
3432
- safe_save_file(shard_state_dict, filename, metadata=metadata)
3433
- # Cleanup the data before next loop (important with offloading, so we don't blowup cpu RAM)
3434
- del shard_state_dict
3361
+ del state_dict
3435
3362
 
3436
3363
  if index is None:
3437
3364
  path_to_weights = os.path.join(save_directory, weights_name)
3438
3365
  logger.info(f"Model weights saved in {path_to_weights}")
3439
3366
  else:
3440
- save_index_file = SAFE_WEIGHTS_INDEX_NAME
3367
+ save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
3441
3368
  save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
3442
3369
  # Save the index as well
3443
3370
  with open(save_index_file, "w", encoding="utf-8") as f:
@@ -3574,9 +3501,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                 " desired `dtype` by passing the correct `dtype` argument."
             )
 
-        if getattr(self, "is_loaded_in_8bit", False) and not is_bitsandbytes_available("0.48"):
+        if getattr(self, "is_loaded_in_8bit", False):
             raise ValueError(
-                "You need to install `pip install bitsandbytes>=0.48.0` if you want to move a 8-bit model across devices using to()."
+                "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the"
+                " model has already been set to the correct devices and casted to the correct `dtype`."
             )
         elif getattr(self, "quantization_method", None) == QuantizationMethod.GPTQ:
             if dtype_present_in_args:
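
Loading an 8-bit bitsandbytes model therefore means picking device placement at load time rather than with `.to()`; a sketch with an illustrative checkpoint (requires the `bitsandbytes` package and a CUDA device):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",
        quantization_config=quant_config,
        device_map="auto",  # placement happens here; .to() afterwards is rejected or version-gated as shown above
    )
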
@@ -3607,38 +3535,23 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
         return super().float(*args)
 
     @classmethod
-    def get_init_context(cls, dtype: torch.dtype, is_quantized: bool, _is_ds_init_called: bool):
-        # Need to instantiate with correct dtype
-        init_contexts = [local_torch_dtype(dtype, cls.__name__)]
+    def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
         if is_deepspeed_zero3_enabled():
             import deepspeed
 
+            init_contexts = [no_init_weights()]
             # We cannot initialize the model on meta device with deepspeed when not quantized
             if not is_quantized and not _is_ds_init_called:
                 logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
-                init_contexts.extend(
-                    [
-                        init.no_init_weights(),
-                        deepspeed.zero.Init(config_dict_or_path=deepspeed_config()),
-                        set_zero3_state(),
-                    ]
-                )
+                init_contexts.extend([deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()])
             elif is_quantized:
-                init_contexts.extend([torch.device("meta"), set_quantized_state()])
+                init_contexts.extend([init_empty_weights(), set_quantized_state()])
         else:
-            init_contexts.append(torch.device("meta"))
+            init_contexts = [no_init_weights(), init_empty_weights()]
 
         return init_contexts
 
-    def set_use_kernels(self, use_kernels, kernel_config: KernelConfig | None = None):
-        """
-        Set whether or not to use the `kernels` library to kernelize some layers of the model.
-        Args:
-            use_kernels (`bool`):
-                Whether or not to use the `kernels` library to kernelize some layers of the model.
-            kernel_config (`KernelConfig`, *optional*):
-                The kernel configuration to use to kernelize the model. If `None`, the default kernel mapping will be used.
-        """
+    def set_use_kernels(self, use_kernels, kernel_config):
         if use_kernels:
             if not is_kernels_available():
                 raise ValueError(
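
The meta-device and empty-weights contexts referenced in `get_init_context` can be reproduced with plain PyTorch; a rough sketch (the Linear module is just a stand-in for a model skeleton):

    import torch
    from torch import nn

    # Parameters are allocated on the meta device: shapes and dtypes only, no storage.
    with torch.device("meta"):
        skeleton = nn.Linear(4096, 4096)
    print(skeleton.weight.device)  # meta
    # accelerate's init_empty_weights() context manager, used on the rc0 side above,
    # achieves the same effect while leaving buffers materialized by default.
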
@@ -3659,9 +3572,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
3659
3572
 
3660
3573
  # This is a context manager to override the default kernel mapping
3661
3574
  # We are calling kernelize inside this context manager using the use_kernels setter
3662
- # Param inherit_mapping should be False to avoid still loading kernel from remote
3663
- inherit_mapping = not kernel_config.use_local_kernel
3664
- with use_kernel_mapping(kernel_config.kernel_mapping, inherit_mapping=inherit_mapping):
3575
+ with use_kernel_mapping(kernel_config.kernel_mapping):
3665
3576
  self.use_kernels = True
3666
3577
  # We use the default kernel mapping in .integrations.hub_kernels
3667
3578
  else:
@@ -3670,18 +3581,19 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
             self.use_kernels = False
 
     @classmethod
+    @restore_default_dtype
     def from_pretrained(
         cls: type[SpecificPreTrainedModelType],
-        pretrained_model_name_or_path: str | os.PathLike | None,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
         *model_args,
-        config: PreTrainedConfig | str | os.PathLike | None = None,
-        cache_dir: str | os.PathLike | None = None,
+        config: Optional[Union[PreTrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
        ignore_mismatched_sizes: bool = False,
        force_download: bool = False,
        local_files_only: bool = False,
-        token: str | bool | None = None,
+        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
-        use_safetensors: bool | None = None,
+        use_safetensors: Optional[bool] = True,
        weights_only: bool = True,
        **kwargs,
    ) -> SpecificPreTrainedModelType:
@@ -3778,18 +3690,10 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
                 "org/model@main"
                 "org/model:custom_kernel"
                 "org/model@v1.2.3:custom_kernel"
-            experts_implementation (`str`, *optional*):
-                The experts implementation to use in the model (if relevant). Can be any of:
-
-                - `"eager"` (sequential implementation of the experts matrix multiplications).
-                - `"batched_mm"` (using [`torch.bmm`](https://pytorch.org/docs/stable/generated/torch.bmm.html)).
-                - `"grouped_mm"` (using [`torch._grouped_mm`](https://docs.pytorch.org/docs/main/generated/torch.nn.functional.grouped_mm.html)).
-
-                By default, if available, `grouped_mm` will be used for torch>=2.9.0. The default is otherwise the sequential `"eager"` implementation.
 
             > Parameters for big model inference
 
-            dtype (`str` or `torch.dtype`, *optional*, defaults to `"auto"`):
+            dtype (`str` or `torch.dtype`, *optional*):
                 Override the default `torch_dtype` and load the model under a specific `dtype`. The different options
                 are:
 
@@ -3931,8 +3835,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
         # For BC on torch_dtype argument
         if torch_dtype is not None:
             dtype = dtype if dtype is not None else torch_dtype
-        if dtype is None:
-            dtype = "auto"
 
         if is_offline_mode() and not local_files_only:
             local_files_only = True
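
Whether an unspecified `dtype` resolves to `"auto"` (the checkpoint's own dtype) or to `float32` is exactly what the removed default controls; passing it explicitly is unambiguous on either side of the diff (checkpoint name illustrative):

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2", dtype=torch.bfloat16)
    # Older call sites that still pass torch_dtype=... keep working thanks to the
    # backward-compatibility shim in the hunk above.
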
@@ -4009,11 +3911,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4009
3911
  if "attn_implementation" in kwargs:
4010
3912
  config._attn_implementation = kwargs.pop("attn_implementation")
4011
3913
 
4012
- if "experts_implementation" in kwargs:
4013
- config._experts_implementation = kwargs.pop("experts_implementation")
4014
-
4015
- hf_quantizer, config, device_map = get_hf_quantizer(
4016
- config, quantization_config, device_map, weights_only, user_agent
3914
+ hf_quantizer, config, dtype, device_map = get_hf_quantizer(
3915
+ config, quantization_config, dtype, device_map, weights_only, user_agent
4017
3916
  )
4018
3917
 
4019
3918
  if gguf_file:
@@ -4060,29 +3959,33 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4060
3959
  ]
4061
3960
 
4062
3961
  # Find the correct dtype based on current state
4063
- config, dtype = _get_dtype(
4064
- dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only, hf_quantizer
3962
+ config, dtype, dtype_orig = _get_dtype(
3963
+ cls, dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only
4065
3964
  )
4066
3965
 
4067
3966
  config.name_or_path = pretrained_model_name_or_path
4068
- model_init_context = cls.get_init_context(dtype, is_quantized, _is_ds_init_called)
3967
+ model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
4069
3968
  config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained.
4070
3969
  with ContextManagers(model_init_context):
4071
3970
  # Let's make sure we don't run the init function of buffer modules
4072
3971
  model = cls(config, *model_args, **model_kwargs)
4073
3972
 
4074
- if hf_quantizer is not None: # replace module with quantized modules (does not touch weights)
4075
- hf_quantizer.preprocess_model(
4076
- model=model,
4077
- dtype=dtype,
4078
- device_map=device_map,
4079
- checkpoint_files=checkpoint_files,
4080
- use_kernels=use_kernels,
4081
- )
4082
-
4083
3973
  # Obtain the weight conversion mapping for this model if any are registered
4084
3974
  weight_conversions = get_model_conversion_mapping(model, key_mapping, hf_quantizer)
4085
3975
 
3976
+ # make sure we use the model's config since the __init__ call might have copied it
3977
+ config = model.config
3978
+
3979
+ if hf_quantizer is not None: # replace module with quantized modules (does not touch weights)
3980
+ hf_quantizer.preprocess_model(
3981
+ model=model,
3982
+ device_map=device_map,
3983
+ keep_in_fp32_modules=model._keep_in_fp32_modules, # TODO prob no longer needed?
3984
+ config=config,
3985
+ checkpoint_files=checkpoint_files,
3986
+ use_kernels=use_kernels,
3987
+ )
3988
+
4086
3989
  if _torch_distributed_available and device_mesh is not None: # add hooks to nn.Modules: no weights
4087
3990
  model = distribute_model(model, tp_plan, distributed_config, device_mesh, tp_size)
4088
3991
 
@@ -4090,30 +3993,33 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4090
3993
  if device_map is not None:
4091
3994
  device_map = _get_device_map(model, device_map, max_memory, hf_quantizer)
4092
3995
 
3996
+ # restore default dtype
3997
+ if dtype_orig is not None:
3998
+ torch.set_default_dtype(dtype_orig)
3999
+
4093
4000
  # Finalize model weight initialization
4094
- load_config = LoadStateDictConfig(
4095
- pretrained_model_name_or_path=pretrained_model_name_or_path,
4001
+ model, missing_keys, unexpected_keys, mismatched_keys, offload_index, error_msgs = cls._load_pretrained_model(
4002
+ model,
4003
+ state_dict,
4004
+ checkpoint_files,
4005
+ pretrained_model_name_or_path,
4096
4006
  ignore_mismatched_sizes=ignore_mismatched_sizes,
4097
4007
  sharded_metadata=sharded_metadata,
4098
4008
  device_map=device_map,
4099
4009
  disk_offload_folder=offload_folder,
4100
- offload_buffers=offload_buffers,
4101
4010
  dtype=dtype,
4102
4011
  hf_quantizer=hf_quantizer,
4103
4012
  device_mesh=device_mesh,
4104
4013
  weights_only=weights_only,
4105
4014
  weight_mapping=weight_conversions,
4106
- use_safetensors=use_safetensors,
4107
- download_kwargs=download_kwargs,
4108
4015
  )
4109
- load_info = cls._load_pretrained_model(model, state_dict, checkpoint_files, load_config)
4110
- load_info = cls._finalize_load_state_dict(model, load_config, load_info)
4111
- model.eval() # Set model in evaluation mode to deactivate Dropout modules by default
4016
+
4017
+ model.eval() # Set model in evaluation mode to deactivate DropOut modules by default
4112
4018
  model.set_use_kernels(use_kernels, kernel_config)
4113
4019
 
4114
4020
  # If it is a model with generation capabilities, attempt to load generation files (generation config,
4115
4021
  # custom generate function)
4116
- if model.can_generate() and hasattr(model, "adjust_generation_fn") and not gguf_file:
4022
+ if model.can_generate() and hasattr(model, "adjust_generation_fn"):
4117
4023
  model.adjust_generation_fn(
4118
4024
  generation_config,
4119
4025
  from_auto_class,
@@ -4124,34 +4030,29 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4124
4030
  **kwargs,
4125
4031
  )
4126
4032
 
4127
- # If the device_map has more than 1 device: dispatch model with hooks on all devices
4128
- if device_map is not None and len(set(device_map.values())) > 1:
4129
- accelerate_dispatch(
4130
- model, hf_quantizer, device_map, offload_folder, load_info.disk_offload_index, offload_buffers
4131
- )
4033
+ # for device_map="auto" : dispatch model with hooks on all devices if necessary
4034
+ if device_map is not None and device_mesh is None:
4035
+ accelerate_dispatch(model, hf_quantizer, device_map, offload_folder, offload_index, offload_buffers)
4132
4036
 
4133
4037
  if hf_quantizer is not None:
4134
4038
  model.hf_quantizer = hf_quantizer
4135
- hf_quantizer.postprocess_model(
4136
- model
4137
- ) # usually a no-op but sometimes needed, e.g to remove the quant config when dequantizing
4039
+ hf_quantizer.postprocess_model(model, config=config) # usually a no-op but sometimes needed
4138
4040
 
4139
4041
  if _adapter_model_path is not None:
4140
- if token is not None:
4141
- adapter_kwargs["token"] = token
4142
- load_info = model.load_adapter(
4042
+ adapter_kwargs["key_mapping"] = weight_conversions # TODO: Dynamic weight loader for adapters
4043
+ model.load_adapter(
4143
4044
  _adapter_model_path,
4144
4045
  adapter_name=adapter_name,
4145
- load_config=load_config,
4046
+ token=token,
4146
4047
  adapter_kwargs=adapter_kwargs,
4147
4048
  )
4148
4049
 
4149
4050
  if output_loading_info:
4150
4051
  loading_info = {
4151
- "missing_keys": load_info.missing_keys,
4152
- "unexpected_keys": load_info.unexpected_keys,
4153
- "mismatched_keys": load_info.mismatched_keys,
4154
- "error_msgs": load_info.error_msgs,
4052
+ "missing_keys": missing_keys,
4053
+ "unexpected_keys": unexpected_keys,
4054
+ "mismatched_keys": mismatched_keys,
4055
+ "error_msgs": error_msgs,
4155
4056
  }
4156
4057
  return model, loading_info
4157
4058
  return model
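
The `loading_info` dictionary built above is what callers get back when opting in; a typical inspection of freshly initialized head weights (checkpoint illustrative):

    from transformers import AutoModelForSequenceClassification

    model, loading_info = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=3, output_loading_info=True
    )
    print(loading_info["missing_keys"])     # e.g. the untrained classification head
    print(loading_info["unexpected_keys"])  # checkpoint weights the model did not use
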
@@ -4160,65 +4061,74 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4160
4061
  def _load_pretrained_model(
4161
4062
  cls,
4162
4063
  model: "PreTrainedModel",
4163
- state_dict: dict | None,
4164
- checkpoint_files: list[str] | None,
4165
- load_config: LoadStateDictConfig,
4166
- ) -> LoadStateDictInfo:
4167
- is_quantized = load_config.is_quantized
4168
- is_hqq_or_quark = is_quantized and load_config.hf_quantizer.quantization_config.quant_method in {
4064
+ state_dict: Optional[dict],
4065
+ checkpoint_files: Optional[list[str]],
4066
+ pretrained_model_name_or_path: Optional[str],
4067
+ ignore_mismatched_sizes: bool = False,
4068
+ sharded_metadata: Optional[dict] = None,
4069
+ device_map: Optional[dict] = None,
4070
+ disk_offload_folder: Optional[str] = None,
4071
+ dtype: Optional[torch.dtype] = None,
4072
+ hf_quantizer: Optional[HfQuantizer] = None,
4073
+ device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None,
4074
+ weights_only: bool = True,
4075
+ weight_mapping: Optional[Sequence[WeightConverter | WeightRenaming]] = None,
4076
+ ):
4077
+ is_quantized = hf_quantizer is not None
4078
+ is_hqq_or_quark = is_quantized and hf_quantizer.quantization_config.quant_method in {
4169
4079
  QuantizationMethod.HQQ,
4170
4080
  QuantizationMethod.QUARK,
4171
4081
  }
4172
4082
 
4173
4083
  # Model's definition arriving here is final (TP hooks added, quantized layers replaces)
4174
4084
  expected_keys = list(model.state_dict().keys())
4175
-
4176
4085
  if logger.level >= logging.WARNING:
4177
4086
  verify_tp_plan(expected_keys, getattr(model, "_tp_plan", None))
4178
4087
 
4179
4088
  # This offload index if for params explicitly on the "disk" in the device_map
4180
4089
  disk_offload_index = None
4181
4090
  # Prepare parameters offloading if needed
4182
- if load_config.device_map is not None and "disk" in load_config.device_map.values():
4091
+ if device_map is not None and "disk" in device_map.values():
4183
4092
  disk_offload_index = accelerate_disk_offload(
4184
- model,
4185
- load_config.disk_offload_folder,
4093
+ disk_offload_folder,
4186
4094
  checkpoint_files,
4187
- load_config.device_map,
4188
- load_config.sharded_metadata,
4189
- load_config.dtype,
4190
- load_config.weight_mapping,
4095
+ device_map,
4096
+ expected_keys,
4097
+ sharded_metadata,
4098
+ dtype,
4099
+ weight_mapping,
4191
4100
  )
4192
4101
 
4193
4102
  # Warmup cuda to load the weights much faster on devices
4194
- if load_config.device_map is not None and not is_hqq_or_quark:
4195
- expanded_device_map = expand_device_map(load_config.device_map, expected_keys)
4196
- caching_allocator_warmup(model, expanded_device_map, load_config.hf_quantizer)
4103
+ if device_map is not None and not is_hqq_or_quark:
4104
+ expanded_device_map = expand_device_map(device_map, expected_keys)
4105
+ caching_allocator_warmup(model, expanded_device_map, hf_quantizer)
4197
4106
 
4107
+ tp_plan = getattr(model, "_tp_plan", None)
4198
4108
  error_msgs = []
4199
4109
 
4200
4110
  if is_deepspeed_zero3_enabled() and not is_quantized:
4201
4111
  if state_dict is None:
4202
4112
  merged_state_dict = {}
4203
4113
  for ckpt_file in checkpoint_files:
4204
- merged_state_dict.update(
4205
- load_state_dict(ckpt_file, map_location="cpu", weights_only=load_config.weights_only)
4206
- )
4114
+ merged_state_dict.update(load_state_dict(ckpt_file, map_location="cpu", weights_only=weights_only))
4207
4115
  state_dict = merged_state_dict
4208
- error_msgs, missing_keys = _load_state_dict_into_zero3_model(model, state_dict, load_config)
4116
+ error_msgs += _load_state_dict_into_zero3_model(model, state_dict)
4209
4117
  # This is not true but for now we assume only best-case scenario with deepspeed, i.e. perfectly matching checkpoints
4210
- unexpected_keys, mismatched_keys, conversion_errors = set(), set(), set()
4118
+ missing_keys, unexpected_keys, mismatched_keys, misc = set(), set(), set(), set()
4211
4119
  else:
4212
4120
  all_pointer = set()
4213
- if state_dict is not None:
4214
- merged_state_dict = state_dict
4215
- elif checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors") and state_dict is None:
4121
+ # Checkpoints are safetensors
4122
+ if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"):
4216
4123
  merged_state_dict = {}
4217
4124
  for file in checkpoint_files:
4218
4125
  file_pointer = safe_open(file, framework="pt", device="cpu")
4219
4126
  all_pointer.add(file_pointer)
4220
4127
  for k in file_pointer.keys():
4221
4128
  merged_state_dict[k] = file_pointer.get_slice(k) # don't materialize yet
4129
+ # User passed an explicit state_dict
4130
+ elif state_dict is not None:
4131
+ merged_state_dict = state_dict
4222
4132
  # Checkpoints are .bin
4223
4133
  elif checkpoint_files is not None:
4224
4134
  merged_state_dict = {}
@@ -4227,14 +4137,19 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4227
4137
  else:
4228
4138
  raise ValueError("Neither a state dict nor checkpoint files were found.")
4229
4139
 
4230
- missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, conversion_errors = (
4140
+ missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, misc = (
4231
4141
  convert_and_load_state_dict_in_model(
4232
- model=model,
4233
- state_dict=merged_state_dict,
4234
- load_config=load_config,
4235
- tp_plan=model._tp_plan,
4236
- dtype_plan=model.dtype_plan,
4237
- disk_offload_index=disk_offload_index,
4142
+ model,
4143
+ merged_state_dict,
4144
+ weight_mapping,
4145
+ tp_plan,
4146
+ hf_quantizer,
4147
+ dtype,
4148
+ device_map,
4149
+ model.dtype_plan,
4150
+ device_mesh,
4151
+ disk_offload_index,
4152
+ disk_offload_folder,
4238
4153
  )
4239
4154
  )
4240
4155
 
@@ -4242,58 +4157,65 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
4242
4157
  for k in all_pointer:
4243
4158
  k.__exit__(None, None, None)
4244
4159
 
4245
- return LoadStateDictInfo(
4246
- missing_keys=missing_keys,
4247
- unexpected_keys=unexpected_keys,
4248
- mismatched_keys=mismatched_keys,
4249
- disk_offload_index=disk_offload_index,
4250
- error_msgs=error_msgs,
4251
- conversion_errors=conversion_errors,
4252
- )
4253
-
4254
- @staticmethod
4255
- def _finalize_load_state_dict(
4256
- model,
4257
- load_config: LoadStateDictConfig,
4258
- load_info: LoadStateDictInfo,
4259
- ) -> LoadStateDictInfo:
4260
- # TODO @ArthurZucker this will be in a separate function to allows people not to run this
4261
- # for more granularity
4262
-
4263
4160
  # Marks tied weights as `_is_hf_initialized` to avoid initializing them (it's very important for efficiency)
4264
4161
  model.mark_tied_weights_as_initialized()
4265
4162
 
4266
- # Move missing (and potentially mismatched) keys and non-persistent buffers back to their expected device from
4267
- # meta device (because they were not moved when loading the weights as they were not in the loaded state dict)
4268
- missing_and_mismatched = load_info.missing_keys | {k[0] for k in load_info.mismatched_keys}
4269
- model._move_missing_keys_from_meta_to_device(
4270
- missing_and_mismatched, load_config.device_map, load_config.device_mesh, load_config.hf_quantizer
4271
- )
4163
+ # Move missing (and potentially mismatched) keys back to cpu from meta device (because they won't be moved when
4164
+ # loading the weights as they are not in the loaded state dict)
4165
+ miss_and_mismatched = missing_keys | {k[0] for k in mismatched_keys}
4166
+ model._move_missing_keys_from_meta_to_cpu(miss_and_mismatched, dtype, hf_quantizer)
4272
4167
 
4273
- # Correctly initialize the missing (and potentially mismatched) keys (all parameters without the `_is_hf_initialized` flag)
4274
- model._initialize_missing_keys(load_config.is_quantized)
4168
+ # Correctly initialize the missing (and potentially mismatched) keys (all parameters without the `_is_hf_initialzed` flag)
4169
+ model._initialize_missing_keys(is_quantized)
4275
4170
 
4276
4171
  # Tie the weights
4277
- model.tie_weights(missing_keys=load_info.missing_keys, recompute_mapping=False)
4172
+ model.tie_weights(missing_keys=missing_keys, recompute_mapping=False)
4278
4173
 
4279
4174
  # Adjust missing and unexpected keys
4280
- missing_keys, unexpected_keys = model._adjust_missing_and_unexpected_keys(
4281
- load_info.missing_keys, load_info.unexpected_keys
4282
- )
4175
+ missing_keys, unexpected_keys = model._adjust_missing_and_unexpected_keys(missing_keys, unexpected_keys)
4176
+
4177
+ # Post-processing for tensor parallelism
4178
+ if device_mesh is not None:
4179
+ # When using TP, the device map is a single device for all parameters
4180
+ tp_device = list(device_map.values())[0]
4181
+ # This is needed for the RotaryEmbedding, which was not initialized on the correct device as it is
4182
+ # not part of the state_dict (persistent=False)
4183
+ for buffer in model.buffers(): # TODO to avaoid this buffer could be added to the ckpt
4184
+ if buffer.device != tp_device:
4185
+ buffer.data = buffer.to(tp_device)
4186
+
4187
+ # In this case, the top-most task module weights were not moved to device and parallelized as they
4188
+ # were not part of the loaded weights: do it now
4189
+ if missing_keys:
4190
+ state_dict = model.state_dict()
4191
+ for name in missing_keys:
4192
+ param = state_dict[name]
4193
+ # Shard the param
4194
+ shard_and_distribute_module(
4195
+ model,
4196
+ param.to(tp_device),
4197
+ param,
4198
+ name,
4199
+ None,
4200
+ False,
4201
+ device_mesh.get_local_rank(),
4202
+ device_mesh,
4203
+ )
4283
4204
 
4284
4205
  log_state_dict_report(
4285
4206
  model=model,
4286
- load_config=load_config,
4207
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
4287
4208
  logger=logger,
4288
- error_msgs=load_info.error_msgs,
4209
+ error_msgs=error_msgs,
4289
4210
  unexpected_keys=unexpected_keys,
4290
4211
  missing_keys=missing_keys,
4291
- mismatched_keys=load_info.mismatched_keys,
4292
- mismatched_shapes=load_info.mismatched_keys,
4293
- conversion_errors=load_info.conversion_errors,
4212
+ mismatched_keys=mismatched_keys,
4213
+ mismatched_shapes=mismatched_keys,
4214
+ misc=misc,
4215
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
4294
4216
  )
4295
4217
 
4296
- return replace(load_info, missing_keys=missing_keys, unexpected_keys=unexpected_keys)
4218
+ return model, missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, error_msgs
4297
4219
 
4298
4220
  def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False):
4299
4221
  module_keys = {".".join(key.split(".")[:-1]) for key in names}
@@ -4362,17 +4284,15 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
         # If the pad token is equal to either BOS, EOS, or SEP, we do not know whether the user should use an
         # attention_mask or not. In this case, we should still show a warning because this is a rare case.
-        # NOTE: `sep_token_id` is not used in all models and it can be absent in the config
-        sep_token_id = getattr(self.config, "sep_token_id", None)
         if (
             (self.config.bos_token_id is not None and self.config.bos_token_id == self.config.pad_token_id)
             or (self.config.eos_token_id is not None and self.config.eos_token_id == self.config.pad_token_id)
-            or (sep_token_id is not None and sep_token_id == self.config.pad_token_id)
+            or (self.config.sep_token_id is not None and self.config.sep_token_id == self.config.pad_token_id)
         ):
             warn_string += (
                 f"\nYou may ignore this warning if your `pad_token_id` ({self.config.pad_token_id}) is identical "
                 f"to the `bos_token_id` ({self.config.bos_token_id}), `eos_token_id` ({self.config.eos_token_id}), "
-                f"or the `sep_token_id` ({sep_token_id}), and your input is not padded."
+                f"or the `sep_token_id` ({self.config.sep_token_id}), and your input is not padded."
             )
 
         logger.warning_once(warn_string)
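
The warning fires only when padding is ambiguous; forwarding the tokenizer's attention mask removes the ambiguity (model and tokenizer names illustrative):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # the common setup that triggers this check
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    batch = tokenizer(["short", "a longer example"], padding=True, return_tensors="pt")
    # attention_mask marks padded positions explicitly, so no guessing is needed.
    out = model.generate(**batch, max_new_tokens=8, pad_token_id=tokenizer.eos_token_id)
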
@@ -4457,7 +4377,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
             )
         self._use_kernels = False
 
-    def get_compiled_call(self, compile_config: CompileConfig | None) -> Callable:
+    def get_compiled_call(self, compile_config: Optional[CompileConfig]) -> Callable:
         """Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between
         non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't
         want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding
@@ -4479,54 +4399,33 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
  def is_backend_compatible(cls):
  return cls._supports_attention_backend
 
- def _move_missing_keys_from_meta_to_device(
- self,
- missing_keys: list[str],
- device_map: dict | None,
- device_mesh: "torch.distributed.device_mesh.DeviceMesh | None",
- hf_quantizer: HfQuantizer | None,
+ def _move_missing_keys_from_meta_to_cpu(
+ self, missing_keys: list[str], dtype: torch.dtype, hf_quantizer: Optional[HfQuantizer]
  ) -> None:
- """Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts)
- back from meta device to their device according to the `device_map` if any, else cpu. Takes care of sharding those
- missing parameters if `device_mesh` is provided, i.e. we are using TP.
- All non-persistent buffers are also moved back to the correct device (they are not part of the state_dict, but are
- not missing either).
+ """Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts) back
+ from meta device to cpu.
  """
  is_quantized = hf_quantizer is not None
- # This is the only case where we do not initialize the model on meta device, so we don't have to do anything here
- if is_deepspeed_zero3_enabled() and not is_quantized:
- return
 
  # In this case we need to move everything back
  if is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized:
+ # We only do it for the parameters, as the buffers are not initialized on the meta device by default
  for key, param in self.named_parameters():
- value = torch.empty_like(param, device="cpu")
- _load_parameter_into_model(self, key, value)
- for key, buffer in self.named_buffers():
- value = torch.empty_like(buffer, device="cpu")
+ value = torch.empty_like(param, dtype=dtype, device="cpu")
  _load_parameter_into_model(self, key, value)
  return
 
+ model_state_dict = self.state_dict()
  # The tied weight keys are in the "missing" usually, but they should not be moved (they will be tied anyway)
  # This is especially important because if they are moved, they will lose the `_is_hf_initialized` flag, and they
  # will be re-initialized for nothing (which can be quite long)
  for key in missing_keys - self.all_tied_weights_keys.keys():
- param = self.get_parameter_or_buffer(key)
- param_device = get_device(device_map, key, valid_torch_device=True)
- value = torch.empty_like(param, device=param_device)
- # For TP, we may need to shard the param
- if device_mesh is not None:
- shard_and_distribute_module(
- self, value, param, key, None, False, device_mesh.get_local_rank(), device_mesh
- )
- # Otherwise, just move it to device
- else:
- _load_parameter_into_model(self, key, value)
- # We need to move back non-persistent buffers as well, as they are not part of loaded weights anyway
- for key, buffer in self.named_non_persistent_buffers():
- buffer_device = get_device(device_map, key, valid_torch_device=True)
- value = torch.empty_like(buffer, device=buffer_device)
- _load_parameter_into_model(self, key, value)
+ param = model_state_dict[key]
+ # Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them
+ if param.device == torch.device("meta"):
+ value = torch.empty_like(param, dtype=dtype, device="cpu")
+ if not is_quantized or not hf_quantizer.param_needs_quantization(self, key):
+ _load_parameter_into_model(self, key, value)
 
  def _initialize_missing_keys(self, is_quantized: bool) -> None:
  """
@@ -4554,6 +4453,8 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
  ) -> tuple[set[str], set[str]]:
  """Adjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid
  raising unneeded warnings/errors.
+ Also, set the `_is_hf_initialized` on tied weight keys, to avoid initializing them as they are going to
+ be tied anyway.
  """
  # Old checkpoints may have keys for rotary_emb.inv_freq for each layer, however we moved this buffer to the main model
  # (so the buffer name has changed). Remove them in such a case. This is another exception that was not added to
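The added docstring lines refer to flagging tied parameters so the init loop skips them. A hedged sketch of that flagging idea; the module and key set are illustrative, and the real flag is set elsewhere in this file:

```python
# Sketch of the "skip init for tied weights" idea mentioned in the docstring above.
import torch.nn as nn

model = nn.Linear(4, 4)
tied_keys = {"weight"}                    # stand-in for model.all_tied_weights_keys.keys()
for name, param in model.named_parameters():
    if name in tied_keys:
        # attribute name as used throughout transformers' loading code
        param._is_hf_initialized = True   # the tensor will be overwritten by weight tying anyway
```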
@@ -4612,19 +4513,6 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
 
  raise AttributeError(f"`{target}` is neither a parameter, buffer, nor extra state.")
 
- def named_non_persistent_buffers(
- self, recurse: bool = True, remove_duplicate: bool = True
- ) -> Iterator[tuple[str, torch.Tensor]]:
- """Similar to `named_buffers`, but only yield non-persistent ones. It is handy as it's not perfectly straightforward
- to know if they are persistent or not"""
- for name, tensor in self.named_buffers(recurse=recurse, remove_duplicate=remove_duplicate):
- # We have to grab the parent here, as the attribute `_non_persistent_buffers_set` is on the immediate
- # parent only
- parent, buf_name = name.rsplit(".", 1) if "." in name else ("", name)
- parent = self.get_submodule(parent)
- if buf_name in parent._non_persistent_buffers_set:
- yield name, tensor
-
  def train(self, mode: bool = True):
  out = super().train(mode)
  if self.use_kernels:
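The deleted helper relied on `_non_persistent_buffers_set`, which PyTorch keeps on the immediate parent module. The same lookup can be reproduced with plain PyTorch, e.g.:

```python
# Reproducing the removed helper's logic with plain PyTorch.
import torch
import torch.nn as nn

class WithBuffers(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("persistent_buf", torch.zeros(1))
        self.register_buffer("transient_buf", torch.zeros(1), persistent=False)

model = nn.Sequential(WithBuffers())
for name, buf in model.named_buffers():
    parent_name, _, buf_name = name.rpartition(".")
    parent = model.get_submodule(parent_name)
    if buf_name in parent._non_persistent_buffers_set:   # same private set the helper used
        print(name, "is non-persistent")                  # -> "0.transient_buf is non-persistent"
```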
@@ -4667,7 +4555,7 @@ def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:
  return model
 
 
- def is_accelerator_device(device: str | int | torch.device) -> bool:
+ def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
  """Check if the device is an accelerator. We need this function, as device_map can be "disk" as well, which is not
  a proper `torch.device`.
  """
@@ -4677,41 +4565,7 @@ def is_accelerator_device(device: str | int | torch.device) -> bool:
  return torch.device(device).type not in ["meta", "cpu"]
 
 
- def get_total_byte_count(
- model: PreTrainedModel, accelerator_device_map: dict, hf_quantizer: HfQuantizer | None = None
- ):
- """
- This utility function calculates the total byte count needed to load the model on each device.
- This is useful for caching_allocator_warmup as we want to know how much cache we need to pre-allocate.
- """
-
- total_byte_count = defaultdict(lambda: 0)
- tied_param_names = model.all_tied_weights_keys.keys()
- tp_plan = model._tp_plan if torch.distributed.is_available() and torch.distributed.is_initialized() else []
-
- for param_name, device in accelerator_device_map.items():
- # Skip if the parameter has already been accounted for (tied weights)
- if param_name in tied_param_names:
- continue
-
- param = model.get_parameter_or_buffer(param_name)
-
- if hf_quantizer is not None:
- dtype_size = hf_quantizer.param_element_size(model, param_name, param)
- else:
- dtype_size = param.element_size()
-
- param_byte_count = param.numel() * dtype_size
-
- if len(tp_plan) > 0:
- is_part_of_plan = _get_parameter_tp_plan(param_name, tp_plan, is_weight=True) is not None
- param_byte_count //= torch.distributed.get_world_size() if is_part_of_plan else 1
-
- total_byte_count[device] += param_byte_count
- return total_byte_count
-
-
- def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict, hf_quantizer: HfQuantizer | None):
+ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict, hf_quantizer: Optional[HfQuantizer]):
  """This function warms up the caching allocator based on the size of the model tensors that will reside on each
  device. It allows one large call to Malloc, instead of recursively calling it later when loading
  the model, which is actually the loading speed bottleneck.
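The warm-up trick the docstring describes can be reproduced in isolation: one large throwaway allocation makes the caching allocator reserve the memory up front, so later per-tensor copies reuse the cache instead of hitting cudaMalloc. A minimal sketch, with an arbitrary byte count and device:

```python
# Minimal reproduction of the warm-up idea: reserve ~N bytes once, then let the
# allocator recycle them. fp16 elements are 2 bytes, hence the // 2 (or the
# quantizer-provided factor in the new code further down).
import torch

if torch.cuda.is_available():
    byte_count = 512 * 1024 * 1024                      # pretend the model needs ~512 MiB on cuda:0
    _ = torch.empty(byte_count // 2, dtype=torch.float16, device="cuda:0", requires_grad=False)
    del _                                               # memory stays in the allocator's cache
    print(torch.cuda.memory_reserved(0))                # roughly byte_count bytes remain reserved
```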
@@ -4730,6 +4584,8 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
  - Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices.
  However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end.
  """
+ factor = 2 if hf_quantizer is None else hf_quantizer.get_accelerator_warm_up_factor()
+
  # Remove disk, cpu and meta devices, and cast to proper torch.device
  accelerator_device_map = {
  param: torch.device(device) for param, device in expanded_device_map.items() if is_accelerator_device(device)
@@ -4737,7 +4593,40 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
  if not accelerator_device_map:
  return
 
- total_byte_count = get_total_byte_count(model, accelerator_device_map, hf_quantizer)
+ tp_plan = getattr(model, "_tp_plan", []) or []
+ tp_plan_regex = (
+ re.compile("|".join([re.escape(plan) for plan in tp_plan]))
+ if _torch_distributed_available and torch.distributed.is_initialized()
+ else None
+ )
+ total_byte_count = defaultdict(lambda: 0)
+ tied_param_names = model.all_tied_weights_keys.keys()
+ for param_name, device in accelerator_device_map.items():
+ # Skip if the parameter has already been accounted for (tied weights)
+ if param_name in tied_param_names:
+ continue
+
+ # For example in the case of MXFP4 quantization, we need to update the param name to the original param name
+ # because the checkpoint contains blocks, and scales, but since we are dequantizing, we need to use the original param name
+ if hf_quantizer is not None:
+ param_name = hf_quantizer.get_param_name(param_name)
+
+ try:
+ param = model.get_parameter_or_buffer(param_name)
+ except AttributeError:
+ # TODO: for now let's skip if we can't find the parameters
+ if hf_quantizer is not None:
+ continue
+ raise AttributeError(f"Parameter {param_name} not found in model")
+
+ # The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`
+ param_byte_count = param.numel() * param.element_size()
+
+ if tp_plan_regex is not None:
+ generic_name = re.sub(r"\.\d+\.", ".*.", param_name)
+ param_byte_count //= torch.distributed.get_world_size() if tp_plan_regex.search(generic_name) else 1
+
+ total_byte_count[device] += param_byte_count
 
  # This will kick off the caching allocator to avoid having to Malloc afterwards
  for device, byte_count in total_byte_count.items():
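The regex trick in the added lines maps a concrete parameter name onto its tensor-parallel plan entry by replacing layer indices with a literal `*`, which then matches the escaped plan keys. A small standalone check of that mapping; the plan entry and parameter name are illustrative:

```python
# Standalone check of the name matching used above: layer indices become a literal
# "*" so the escaped tp_plan keys (which contain "*") match directly.
import re

tp_plan = ["model.layers.*.self_attn.q_proj"]             # illustrative plan entry
tp_plan_regex = re.compile("|".join(re.escape(plan) for plan in tp_plan))

param_name = "model.layers.3.self_attn.q_proj.weight"
generic_name = re.sub(r"\.\d+\.", ".*.", param_name)       # -> "model.layers.*.self_attn.q_proj.weight"
print(bool(tp_plan_regex.search(generic_name)))            # True -> byte count is divided by the world size
```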
@@ -4757,9 +4646,9 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
  unused_memory = torch_accelerator_module.memory_reserved(
  index
  ) - torch_accelerator_module.memory_allocated(index)
- byte_count = int(max(0, byte_count - unused_memory))
- # We divide by 2 here as we allocate in fp16
- _ = torch.empty(byte_count // 2, dtype=torch.float16, device=device, requires_grad=False)
+ byte_count = max(0, byte_count - unused_memory)
+ # Allocate memory
+ _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)
 
 
  class AttentionInterface(GeneralInterface):