transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
|
@@ -18,9 +18,10 @@ from __future__ import annotations
|
|
|
18
18
|
import math
|
|
19
19
|
import os
|
|
20
20
|
import re
|
|
21
|
+
import traceback
|
|
21
22
|
from abc import abstractmethod
|
|
22
23
|
from collections import defaultdict
|
|
23
|
-
from collections.abc import Callable
|
|
24
|
+
from collections.abc import Callable
|
|
24
25
|
from concurrent.futures import Future, ThreadPoolExecutor
|
|
25
26
|
from contextlib import contextmanager
|
|
26
27
|
from copy import deepcopy
|
|
@@ -32,17 +33,15 @@ import torch
|
|
|
32
33
|
|
|
33
34
|
from .integrations.accelerate import get_device, offload_weight
|
|
34
35
|
from .integrations.tensor_parallel import ALL_PARALLEL_STYLES
|
|
35
|
-
from .utils import is_env_variable_true,
|
|
36
|
+
from .utils import is_env_variable_true, logging
|
|
37
|
+
from .utils.loading_report import LoadStateDictInfo
|
|
36
38
|
|
|
37
39
|
|
|
38
40
|
_torch_distributed_available = torch.distributed.is_available()
|
|
39
|
-
_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
|
|
40
|
-
if _is_dtensor_available:
|
|
41
|
-
from torch.distributed.tensor import DTensor, Replicate
|
|
42
41
|
|
|
43
42
|
if TYPE_CHECKING:
|
|
44
43
|
from .integrations.tensor_parallel import TensorParallelLayer
|
|
45
|
-
from .modeling_utils import PreTrainedModel
|
|
44
|
+
from .modeling_utils import LoadStateDictConfig, PreTrainedModel
|
|
46
45
|
from .quantizers import HfQuantizer
|
|
47
46
|
|
|
48
47
|
|
|
@@ -113,12 +112,12 @@ class Chunk(ConversionOps):
|
|
|
113
112
|
) -> dict[str, torch.Tensor]:
|
|
114
113
|
tensors = next(iter(input_dict.values()))
|
|
115
114
|
tensor = tensors[0] if isinstance(tensors, list) else tensors
|
|
116
|
-
targets = self.
|
|
115
|
+
targets = self.get_target_patterns(input_dict, target_patterns)
|
|
117
116
|
sizes = len(targets)
|
|
118
117
|
chunks = torch.chunk(tensor, sizes, dim=self.dim)
|
|
119
118
|
return dict(zip(targets, chunks))
|
|
120
119
|
|
|
121
|
-
def
|
|
120
|
+
def get_target_patterns(self, input_dict: dict, target_patterns: list[str]) -> list[str]:
|
|
122
121
|
# Here we always return the target patterns
|
|
123
122
|
if len(input_dict) > 1 or len(target_patterns) == 1:
|
|
124
123
|
raise ValueError("Undefined Operation encountered!")
|
|
@@ -245,6 +244,44 @@ class SplitModulelist(ConversionOps):
|
|
|
245
244
|
return MergeModulelist(self.dim)
|
|
246
245
|
|
|
247
246
|
|
|
247
|
+
class Transpose(ConversionOps):
|
|
248
|
+
"""
|
|
249
|
+
Transposes the given tensor along dim0 and dim1.
|
|
250
|
+
"""
|
|
251
|
+
|
|
252
|
+
def __init__(self, dim0: int = 0, dim1: int = 1):
|
|
253
|
+
self.dim0 = dim0
|
|
254
|
+
self.dim1 = dim1
|
|
255
|
+
|
|
256
|
+
@torch.no_grad
|
|
257
|
+
def convert(
|
|
258
|
+
self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs
|
|
259
|
+
) -> dict[str, torch.Tensor]:
|
|
260
|
+
target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns)
|
|
261
|
+
tensors = next(iter(input_dict.values()))
|
|
262
|
+
tensor = tensors[0] if isinstance(tensors, list) else tensors
|
|
263
|
+
return {target_pattern: torch.transpose(tensor, dim0=self.dim0, dim1=self.dim1).contiguous()}
|
|
264
|
+
|
|
265
|
+
def get_target_pattern(
|
|
266
|
+
self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str]
|
|
267
|
+
) -> str:
|
|
268
|
+
if len(input_dict) != 1:
|
|
269
|
+
raise ValueError("Undefined Operation encountered!")
|
|
270
|
+
# Here it's the first operation of a chain, so return the source
|
|
271
|
+
if len(target_patterns) > 1:
|
|
272
|
+
if len(source_patterns) == 1:
|
|
273
|
+
return source_patterns[0]
|
|
274
|
+
else:
|
|
275
|
+
raise ValueError("Undefined Operation encountered!")
|
|
276
|
+
# Here it's the only operation, or the last operation in a chain, so we return the target
|
|
277
|
+
else:
|
|
278
|
+
return target_patterns[0]
|
|
279
|
+
|
|
280
|
+
@property
|
|
281
|
+
def reverse_op(self) -> ConversionOps:
|
|
282
|
+
return Transpose(dim0=self.dim1, dim1=self.dim0)
|
|
283
|
+
|
|
284
|
+
|
|
248
285
|
class PermuteForRope(ConversionOps):
|
|
249
286
|
"""
|
|
250
287
|
Applies the permutation required to convert complex RoPE weights to the split sin/cos format.
|
|
@@ -402,41 +439,72 @@ class ErnieSplitAndDecoupleTextVisionExperts(ConversionOps):
|
|
|
402
439
|
return ErnieFuseAndSplitTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
|
|
403
440
|
|
|
404
441
|
|
|
405
|
-
class
|
|
442
|
+
class Force16BytesAlignment(ConversionOps):
|
|
406
443
|
"""
|
|
407
|
-
|
|
444
|
+
Ensures that the given tensor is 16-bytes aligned in memory and clones it if not.
|
|
445
|
+
This garantees 16-bytes alignmenet for kernels / implementations that use TMA or SIMD instructions like torch._grouped_mm.
|
|
408
446
|
"""
|
|
409
447
|
|
|
410
|
-
def __init__(self, dim0: int = 0, dim1: int = 1):
|
|
411
|
-
self.dim0 = dim0
|
|
412
|
-
self.dim1 = dim1
|
|
413
|
-
|
|
414
448
|
@torch.no_grad()
|
|
415
449
|
def convert(
|
|
416
|
-
self,
|
|
417
|
-
|
|
418
|
-
source_patterns
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
if len(input_dict) != len(target_patterns):
|
|
424
|
-
raise ValueError(
|
|
425
|
-
f"Transpose conversion can only happen on each key ({len(input_dict)}) "
|
|
426
|
-
f"and should match exact one target ({len(target_patterns)})."
|
|
427
|
-
)
|
|
450
|
+
self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs
|
|
451
|
+
) -> dict[str, torch.Tensor]:
|
|
452
|
+
target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns)
|
|
453
|
+
tensors = next(iter(input_dict.values()))
|
|
454
|
+
tensor = tensors[0] if isinstance(tensors, list) else tensors
|
|
455
|
+
tensor = tensor.clone() if tensor.data_ptr() % 16 != 0 else tensor
|
|
456
|
+
return {target_pattern: tensor}
|
|
428
457
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
458
|
+
def get_target_pattern(
|
|
459
|
+
self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str]
|
|
460
|
+
) -> str:
|
|
461
|
+
if len(input_dict) != 1:
|
|
462
|
+
raise ValueError("Undefined Operation encountered!")
|
|
463
|
+
# Here it's the first operation of a chain, so return the source
|
|
464
|
+
if len(target_patterns) > 1:
|
|
465
|
+
if len(source_patterns) == 1:
|
|
466
|
+
return source_patterns[0]
|
|
467
|
+
else:
|
|
468
|
+
raise ValueError("Undefined Operation encountered!")
|
|
469
|
+
# Here it's the only operation, or the last operation in a chain, so we return the target
|
|
470
|
+
else:
|
|
471
|
+
return target_patterns[0]
|
|
436
472
|
|
|
437
473
|
@property
|
|
438
474
|
def reverse_op(self) -> ConversionOps:
|
|
439
|
-
return
|
|
475
|
+
return Force16BytesAlignment()
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def process_target_pattern(pattern: str) -> tuple[str, str | None]:
|
|
479
|
+
"""
|
|
480
|
+
Process a target pattern for reverse mapping (when targets become sources).
|
|
481
|
+
|
|
482
|
+
This handles several edge cases in checkpoint conversion mappings:
|
|
483
|
+
- Removes `^` prefix and `$` suffix (start/end of string anchors)
|
|
484
|
+
- Removes negative lookahead/lookbehind assertions
|
|
485
|
+
- Detects capturing groups and replaces them with `\\1` backreference
|
|
486
|
+
|
|
487
|
+
Args:
|
|
488
|
+
pattern: The target pattern to process for reverse mapping.
|
|
489
|
+
|
|
490
|
+
Returns:
|
|
491
|
+
A tuple of (processed_pattern, captured_group) where captured_group is
|
|
492
|
+
the original capturing group found (e.g., "(encoder|decoder)") or None.
|
|
493
|
+
"""
|
|
494
|
+
# Some mapping contains `^` to notify start of string when matching -> remove it during reverse mapping
|
|
495
|
+
pattern = pattern.removeprefix("^")
|
|
496
|
+
# Some mapping contains `$` to notify end of string when matching -> remove it during reverse mapping
|
|
497
|
+
pattern = pattern.removesuffix("$")
|
|
498
|
+
# Remove negative lookahead/behind if any. This is ugly but needed for reverse mapping of
|
|
499
|
+
# Qwen2.5, Sam3, Ernie4.5 VL MoE!
|
|
500
|
+
pattern = re.sub(r"\(\?.+\)", "", pattern)
|
|
501
|
+
# Allow capturing groups in patterns, i.e. to add/remove a prefix to all keys (e.g. timm_wrapper, sam3)
|
|
502
|
+
capturing_group_match = re.search(r"\(.+?\)", pattern)
|
|
503
|
+
captured_group = None
|
|
504
|
+
if capturing_group_match:
|
|
505
|
+
captured_group = capturing_group_match.group(0)
|
|
506
|
+
pattern = pattern.replace(captured_group, r"\1", 1)
|
|
507
|
+
return pattern, captured_group
|
|
440
508
|
|
|
441
509
|
|
|
442
510
|
@dataclass(slots=True)
|
|
@@ -451,32 +519,50 @@ class WeightTransform:
|
|
|
451
519
|
collected_tensors: dict[str, list[Future]] = field(default_factory=lambda: defaultdict(list), init=False)
|
|
452
520
|
layer_targets: dict[str, set[str]] = field(default_factory=lambda: defaultdict(set), init=False)
|
|
453
521
|
|
|
454
|
-
def
|
|
455
|
-
if
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
self
|
|
522
|
+
def __setattr__(self, name, value):
|
|
523
|
+
if name in ("source_patterns", "target_patterns"):
|
|
524
|
+
# We do not allow to re-set the patterns, as they are linked between each other and changing one
|
|
525
|
+
# without the other can mess-up with the capturing groups/compiled sources
|
|
526
|
+
if hasattr(self, name):
|
|
527
|
+
raise ValueError(f"Cannot assign to field {name}, you should create a new instance")
|
|
528
|
+
# Switch str to list
|
|
529
|
+
elif isinstance(value, str):
|
|
530
|
+
value = [value]
|
|
531
|
+
object.__setattr__(self, name, value)
|
|
459
532
|
|
|
533
|
+
def __post_init__(self):
|
|
460
534
|
# Due to how our `_checkpoint_conversion_mapping` mappings are written, we need a few exceptions here
|
|
461
535
|
# when instantiating the reverse mapping (i.e. the targets become sources, and sources become targets)
|
|
462
536
|
# The issues lie in the sources usually, so here we need to check the targets for the reversed mapping
|
|
537
|
+
|
|
538
|
+
# Process target_patterns: detect capturing groups and replace with \1
|
|
539
|
+
# Store the original capturing group patterns for reverse mapping
|
|
540
|
+
target_capturing_groups: list[str] = []
|
|
463
541
|
for i, pattern in enumerate(self.target_patterns):
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
542
|
+
self.target_patterns[i], captured_group = process_target_pattern(pattern)
|
|
543
|
+
if captured_group is not None:
|
|
544
|
+
target_capturing_groups.append(captured_group)
|
|
545
|
+
|
|
546
|
+
# Validate that we only have one unique capturing group pattern across all targets
|
|
547
|
+
# This ensures deterministic reverse mapping when sources have \1 backreferences
|
|
548
|
+
unique_capturing_groups = set(target_capturing_groups)
|
|
549
|
+
if len(unique_capturing_groups) > 1:
|
|
550
|
+
raise ValueError(
|
|
551
|
+
f"Multiple different capturing groups found in target_patterns: {unique_capturing_groups}. "
|
|
552
|
+
f"All target patterns must use the same capturing group pattern."
|
|
553
|
+
)
|
|
554
|
+
unique_capturing_group = unique_capturing_groups.pop() if unique_capturing_groups else None
|
|
475
555
|
|
|
476
556
|
# We also need to check capturing groups in the sources during reverse mapping (e.g. timm_wrapper, sam3)
|
|
477
557
|
for i, pattern in enumerate(self.source_patterns):
|
|
478
558
|
if r"\1" in pattern:
|
|
479
|
-
|
|
559
|
+
if unique_capturing_group is None:
|
|
560
|
+
raise ValueError(
|
|
561
|
+
f"Source pattern '{pattern}' contains \\1 backreference, but no capturing groups "
|
|
562
|
+
f"found in target_patterns."
|
|
563
|
+
)
|
|
564
|
+
# Use the unique capturing group from target_patterns for all sources
|
|
565
|
+
pattern = pattern.replace(r"\1", unique_capturing_group, 1)
|
|
480
566
|
self.source_patterns[i] = pattern
|
|
481
567
|
|
|
482
568
|
# Construct the regex we will use to rename keys from the sources to the targets
|
|
@@ -552,7 +638,7 @@ class WeightTransform:
|
|
|
552
638
|
tensors = self.collected_tensors.pop(key)
|
|
553
639
|
# Async loading
|
|
554
640
|
if isinstance(tensors[0], Future):
|
|
555
|
-
tensors = [future.result() for future in tensors]
|
|
641
|
+
tensors = [future.result() for future in tensors if future.result() is not None]
|
|
556
642
|
# Sync loading
|
|
557
643
|
elif callable(tensors[0]):
|
|
558
644
|
tensors = [func() for func in tensors]
|
|
@@ -572,8 +658,7 @@ class WeightRenaming(WeightTransform):
|
|
|
572
658
|
model=None,
|
|
573
659
|
config=None,
|
|
574
660
|
hf_quantizer=None,
|
|
575
|
-
|
|
576
|
-
conversion_errors: MutableMapping[str, str] | None = None,
|
|
661
|
+
loading_info: LoadStateDictInfo | None = None,
|
|
577
662
|
):
|
|
578
663
|
# Collect the tensors here - we use a new dictionary to avoid keeping them in memory in the internal
|
|
579
664
|
# attribute during the whole process
|
|
@@ -586,7 +671,7 @@ class WeightRenaming(WeightTransform):
|
|
|
586
671
|
|
|
587
672
|
if hf_quantizer is not None and self.quantization_operation is not None:
|
|
588
673
|
with log_conversion_errors(
|
|
589
|
-
layer_name,
|
|
674
|
+
layer_name, loading_info, (len(collected_tensors), layer_name), self.quantization_operation
|
|
590
675
|
):
|
|
591
676
|
collected_tensors = self.quantization_operation.convert(
|
|
592
677
|
collected_tensors,
|
|
@@ -595,10 +680,10 @@ class WeightRenaming(WeightTransform):
|
|
|
595
680
|
full_layer_name=target_key,
|
|
596
681
|
model=model,
|
|
597
682
|
config=config,
|
|
598
|
-
missing_keys=missing_keys,
|
|
683
|
+
missing_keys=loading_info.missing_keys if loading_info else None,
|
|
599
684
|
)
|
|
600
685
|
|
|
601
|
-
return collected_tensors
|
|
686
|
+
return collected_tensors
|
|
602
687
|
|
|
603
688
|
|
|
604
689
|
# List of classes that are known to be able to use m:n
|
|
@@ -629,15 +714,14 @@ class WeightConverter(WeightTransform):
|
|
|
629
714
|
model=None,
|
|
630
715
|
config=None,
|
|
631
716
|
hf_quantizer=None,
|
|
632
|
-
|
|
633
|
-
conversion_errors: MutableMapping[str, str] | None = None,
|
|
717
|
+
loading_info: LoadStateDictInfo | None = None,
|
|
634
718
|
):
|
|
635
719
|
# Collect the tensors here - we use a new dictionary to avoid keeping them in memory in the internal
|
|
636
|
-
# attribute during the whole
|
|
720
|
+
# attribute during the whole proces
|
|
637
721
|
collected_tensors = self.materialize_tensors()
|
|
638
722
|
|
|
639
723
|
for op in self.operations:
|
|
640
|
-
with log_conversion_errors(layer_name,
|
|
724
|
+
with log_conversion_errors(layer_name, loading_info, (len(collected_tensors), layer_name), op):
|
|
641
725
|
collected_tensors = op.convert(
|
|
642
726
|
collected_tensors,
|
|
643
727
|
source_patterns=self.source_patterns,
|
|
@@ -646,7 +730,7 @@ class WeightConverter(WeightTransform):
|
|
|
646
730
|
full_layer_name=layer_name,
|
|
647
731
|
model=model,
|
|
648
732
|
config=config,
|
|
649
|
-
missing_keys=missing_keys,
|
|
733
|
+
missing_keys=loading_info.missing_keys if loading_info else None,
|
|
650
734
|
)
|
|
651
735
|
|
|
652
736
|
# Tensors are returned from ops with the target patterns, we need to expand them to full name.
|
|
@@ -665,7 +749,7 @@ class WeightConverter(WeightTransform):
|
|
|
665
749
|
|
|
666
750
|
if hf_quantizer is not None and self.quantization_operation is not None:
|
|
667
751
|
with log_conversion_errors(
|
|
668
|
-
layer_name,
|
|
752
|
+
layer_name, loading_info, (len(collected_tensors), layer_name), self.quantization_operation
|
|
669
753
|
):
|
|
670
754
|
collected_tensors = self.quantization_operation.convert(
|
|
671
755
|
collected_tensors,
|
|
@@ -674,9 +758,9 @@ class WeightConverter(WeightTransform):
|
|
|
674
758
|
full_layer_name=layer_name,
|
|
675
759
|
config=config,
|
|
676
760
|
model=model,
|
|
677
|
-
missing_keys=missing_keys,
|
|
761
|
+
missing_keys=loading_info.missing_keys if loading_info else None,
|
|
678
762
|
)
|
|
679
|
-
return collected_tensors
|
|
763
|
+
return collected_tensors
|
|
680
764
|
|
|
681
765
|
|
|
682
766
|
# For I/O bound operations (i.e. here reading files), it is better to have fewer threads, e.g. 4 is a good default.
|
|
@@ -739,7 +823,7 @@ def dot_natural_key(s: str):
|
|
|
739
823
|
@contextmanager
|
|
740
824
|
def log_conversion_errors(
|
|
741
825
|
first_target_key: str,
|
|
742
|
-
|
|
826
|
+
loading_info: LoadStateDictInfo | None,
|
|
743
827
|
extras: Any = None,
|
|
744
828
|
op: list[ConversionOps] | ConversionOps | None = None,
|
|
745
829
|
):
|
|
@@ -748,6 +832,9 @@ def log_conversion_errors(
|
|
|
748
832
|
try:
|
|
749
833
|
yield
|
|
750
834
|
except Exception as e:
|
|
835
|
+
# During reverse mapping, we do not log and skip errors
|
|
836
|
+
if loading_info is None:
|
|
837
|
+
raise e
|
|
751
838
|
|
|
752
839
|
def _format_op_name(curr_op: list[ConversionOps] | ConversionOps | None) -> str | None:
|
|
753
840
|
if curr_op is None:
|
|
@@ -760,19 +847,23 @@ def log_conversion_errors(
|
|
|
760
847
|
return curr_op.__class__.__name__
|
|
761
848
|
|
|
762
849
|
op_name = _format_op_name(op)
|
|
850
|
+
|
|
851
|
+
tb_str = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
763
852
|
if isinstance(extras, tuple) and len(extras) == 2:
|
|
764
853
|
length, target_keys = extras
|
|
765
854
|
descriptor = f"{op_name} " if op_name else ""
|
|
766
|
-
conversion_errors[first_target_key] = (
|
|
767
|
-
f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {length}"
|
|
855
|
+
loading_info.conversion_errors[first_target_key] = (
|
|
856
|
+
f"{tb_str}{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {length}"
|
|
768
857
|
)
|
|
769
858
|
elif isinstance(extras, str):
|
|
770
859
|
suffix = f" via {op_name}" if op_name else ""
|
|
771
|
-
conversion_errors[first_target_key] =
|
|
860
|
+
loading_info.conversion_errors[first_target_key] = (
|
|
861
|
+
f"{tb_str}{e}\nError{suffix} when processing parameter {extras}"
|
|
862
|
+
)
|
|
772
863
|
elif extras is None and op_name:
|
|
773
|
-
conversion_errors[first_target_key] = f"{op_name}: {e}"
|
|
864
|
+
loading_info.conversion_errors[first_target_key] = f"{op_name}: {e}"
|
|
774
865
|
else:
|
|
775
|
-
conversion_errors[first_target_key] = f"{extras} |Error: {e}"
|
|
866
|
+
loading_info.conversion_errors[first_target_key] = f"{extras} |Error: {e}"
|
|
776
867
|
|
|
777
868
|
# Raise a specific Exception that we can catch easily
|
|
778
869
|
raise SkipParameters()
|
|
@@ -782,9 +873,7 @@ def set_param_for_module(
|
|
|
782
873
|
model: PreTrainedModel,
|
|
783
874
|
target_name: str,
|
|
784
875
|
param_value: torch.Tensor,
|
|
785
|
-
|
|
786
|
-
missing_keys: MutableSet[str],
|
|
787
|
-
unexpected_keys: MutableSet[str],
|
|
876
|
+
loading_info: LoadStateDictInfo,
|
|
788
877
|
distributed_operation: TensorParallelLayer | None,
|
|
789
878
|
hf_quantizer: HfQuantizer,
|
|
790
879
|
):
|
|
@@ -793,26 +882,23 @@ def set_param_for_module(
|
|
|
793
882
|
|
|
794
883
|
ref = getattr(module_obj, param_name)
|
|
795
884
|
if ref is None:
|
|
796
|
-
unexpected_keys.add(target_name)
|
|
885
|
+
loading_info.unexpected_keys.add(target_name)
|
|
797
886
|
else:
|
|
798
887
|
if not isinstance(param_value, torch.nn.Parameter):
|
|
799
|
-
if distributed_operation is not None:
|
|
800
|
-
if getattr(distributed_operation, "use_dtensor", False):
|
|
801
|
-
param_value = DTensor.from_local(
|
|
802
|
-
param_value,
|
|
803
|
-
distributed_operation.device_mesh,
|
|
804
|
-
getattr(distributed_operation, "shard", Replicate()),
|
|
805
|
-
run_check=False,
|
|
806
|
-
shape=ref.size(),
|
|
807
|
-
stride=ref.stride(),
|
|
808
|
-
)
|
|
809
888
|
if param_name not in module_obj._buffers:
|
|
810
889
|
param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
|
|
811
890
|
|
|
812
891
|
# Remove from missing keys (it's either mismatched, or all good)
|
|
813
|
-
missing_keys.discard(target_name)
|
|
814
|
-
|
|
815
|
-
|
|
892
|
+
loading_info.missing_keys.discard(target_name)
|
|
893
|
+
|
|
894
|
+
# Determine expected shape: for TP, use sharded shape; otherwise, use full shape
|
|
895
|
+
if distributed_operation is not None:
|
|
896
|
+
expected_shape = torch.Size(distributed_operation.get_expected_sharded_shape(ref.shape))
|
|
897
|
+
else:
|
|
898
|
+
expected_shape = ref.shape
|
|
899
|
+
|
|
900
|
+
if ref is not None and param_value.shape != expected_shape and hf_quantizer is None:
|
|
901
|
+
loading_info.mismatched_keys.add((target_name, param_value.shape, expected_shape))
|
|
816
902
|
else:
|
|
817
903
|
# super important otherwise _init_weight will re-init the param
|
|
818
904
|
param_value._is_hf_initialized = True
|
|
@@ -822,7 +908,7 @@ def set_param_for_module(
|
|
|
822
908
|
def offload_and_maybe_resave_param(
|
|
823
909
|
target_name: str,
|
|
824
910
|
param: torch.Tensor,
|
|
825
|
-
|
|
911
|
+
loading_info: LoadStateDictInfo,
|
|
826
912
|
disk_offload_folder: str,
|
|
827
913
|
disk_offload_index: dict,
|
|
828
914
|
applied_ops: WeightConverter | WeightRenaming,
|
|
@@ -831,7 +917,7 @@ def offload_and_maybe_resave_param(
|
|
|
831
917
|
WeightConverter operations have been applied, it will resave the new parameter. Otherwise, it will use the original
|
|
832
918
|
`disk_offload_index` for this given param."""
|
|
833
919
|
# We need to remove from missing keys
|
|
834
|
-
missing_keys.discard(target_name)
|
|
920
|
+
loading_info.missing_keys.discard(target_name)
|
|
835
921
|
# If not already offloaded, or if we applied any special Operation except Renaming, we need to re-save
|
|
836
922
|
if target_name not in disk_offload_index or isinstance(applied_ops, WeightConverter):
|
|
837
923
|
disk_offload_index = offload_weight(param, target_name, disk_offload_folder, disk_offload_index)
|
|
@@ -886,16 +972,9 @@ def rename_source_key(
|
|
|
886
972
|
def convert_and_load_state_dict_in_model(
|
|
887
973
|
model: PreTrainedModel,
|
|
888
974
|
state_dict: dict[str, Any],
|
|
889
|
-
|
|
975
|
+
load_config: LoadStateDictConfig,
|
|
890
976
|
tp_plan: dict[str, str] | None,
|
|
891
|
-
hf_quantizer: HfQuantizer | None,
|
|
892
|
-
dtype: torch.dtype | None = None,
|
|
893
|
-
device_map: dict | None = None,
|
|
894
|
-
dtype_plan: dict | None = None,
|
|
895
|
-
device_mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
|
|
896
977
|
disk_offload_index: dict | None = None,
|
|
897
|
-
disk_offload_folder: str | None = None,
|
|
898
|
-
offload_buffers: bool = False,
|
|
899
978
|
):
|
|
900
979
|
r"""
|
|
901
980
|
We build a mapping from the keys obtained by renaming each of the checkpoint keys according to the weight_mapping rules.
|
|
@@ -985,16 +1064,25 @@ def convert_and_load_state_dict_in_model(
|
|
|
985
1064
|
"""
|
|
986
1065
|
prefix = model.base_model_prefix
|
|
987
1066
|
tp_plan = tp_plan or {}
|
|
988
|
-
device_map = device_map or {"": "cpu"}
|
|
989
|
-
|
|
990
|
-
|
|
1067
|
+
device_map = load_config.device_map or {"": "cpu"}
|
|
1068
|
+
hf_quantizer = load_config.hf_quantizer
|
|
1069
|
+
dtype = load_config.dtype
|
|
1070
|
+
device_mesh = load_config.device_mesh
|
|
1071
|
+
disk_offload_folder = load_config.disk_offload_folder
|
|
1072
|
+
offload_buffers = load_config.offload_buffers
|
|
1073
|
+
dtype_plan = load_config.dtype_plan or {}
|
|
1074
|
+
weight_mapping = load_config.weight_mapping or []
|
|
991
1075
|
meta_model_state_dict = model.state_dict()
|
|
992
1076
|
model_buffers = {k for k, _ in model.named_buffers()}
|
|
993
1077
|
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
1078
|
+
# We start from all missing keys, and we will remove/add them from the proper containers as loading advances
|
|
1079
|
+
loading_info = LoadStateDictInfo(
|
|
1080
|
+
missing_keys=set(meta_model_state_dict.keys()),
|
|
1081
|
+
unexpected_keys=set(),
|
|
1082
|
+
mismatched_keys=set(),
|
|
1083
|
+
conversion_errors={},
|
|
1084
|
+
error_msgs=[],
|
|
1085
|
+
)
|
|
998
1086
|
|
|
999
1087
|
# We use threading by default, if not explicitly deactivated via env variable. If we have to offload,
|
|
1000
1088
|
# we cannot use it either to control the memory as we are under memory constraints, so we need to be sequential
|
|
@@ -1024,7 +1112,7 @@ def convert_and_load_state_dict_in_model(
|
|
|
1024
1112
|
)
|
|
1025
1113
|
|
|
1026
1114
|
# 2. finally, collect the tensor into the proper converter
|
|
1027
|
-
if renamed_key in
|
|
1115
|
+
if renamed_key in meta_model_state_dict:
|
|
1028
1116
|
empty_param = meta_model_state_dict.get(renamed_key)
|
|
1029
1117
|
# If we enter here, we have a WeightConverter operation to perform
|
|
1030
1118
|
if source_pattern is not None:
|
|
@@ -1067,7 +1155,11 @@ def convert_and_load_state_dict_in_model(
|
|
|
1067
1155
|
mapping.distributed_operation = tp_layer(
|
|
1068
1156
|
device_mesh=device_mesh, rank=device_mesh.get_local_rank(), empty_param=empty_param.clone()
|
|
1069
1157
|
)
|
|
1070
|
-
shard_index =
|
|
1158
|
+
shard_index = (
|
|
1159
|
+
len(mapping.collected_tensors.get(source_pattern, []))
|
|
1160
|
+
if isinstance(mapping, WeightConverter) and isinstance(mapping.operations[0], MergeModulelist)
|
|
1161
|
+
else None
|
|
1162
|
+
)
|
|
1071
1163
|
future_or_tensor = spawn_tp_materialize(
|
|
1072
1164
|
thread_pool,
|
|
1073
1165
|
tensor,
|
|
@@ -1085,9 +1177,9 @@ def convert_and_load_state_dict_in_model(
|
|
|
1085
1177
|
elif source_pattern is not None: # add all target keys as unexpected
|
|
1086
1178
|
mapping = pattern_to_converter[source_pattern]
|
|
1087
1179
|
for k in mapping.target_patterns:
|
|
1088
|
-
unexpected_keys.add(renamed_key.replace(mapping.target_patterns[0], k))
|
|
1180
|
+
loading_info.unexpected_keys.add(renamed_key.replace(mapping.target_patterns[0], k))
|
|
1089
1181
|
else:
|
|
1090
|
-
unexpected_keys.add(renamed_key)
|
|
1182
|
+
loading_info.unexpected_keys.add(renamed_key)
|
|
1091
1183
|
|
|
1092
1184
|
try:
|
|
1093
1185
|
total_entries = len(param_name_to_load)
|
|
@@ -1097,13 +1189,12 @@ def convert_and_load_state_dict_in_model(
|
|
|
1097
1189
|
pbar.set_postfix({"Materializing param": first_param_name})
|
|
1098
1190
|
pbar.refresh()
|
|
1099
1191
|
try:
|
|
1100
|
-
realized_value
|
|
1192
|
+
realized_value = mapping.convert(
|
|
1101
1193
|
first_param_name,
|
|
1102
1194
|
model=model,
|
|
1103
1195
|
config=model.config,
|
|
1104
1196
|
hf_quantizer=hf_quantizer,
|
|
1105
|
-
|
|
1106
|
-
conversion_errors=conversion_errors,
|
|
1197
|
+
loading_info=loading_info,
|
|
1107
1198
|
)
|
|
1108
1199
|
for target_name, param in realized_value.items():
|
|
1109
1200
|
param = param[0] if isinstance(param, list) else param
|
|
@@ -1111,16 +1202,14 @@ def convert_and_load_state_dict_in_model(
|
|
|
1111
1202
|
# Offloading support
|
|
1112
1203
|
if param_device == "disk" and (target_name not in model_buffers or offload_buffers):
|
|
1113
1204
|
disk_offload_index = offload_and_maybe_resave_param(
|
|
1114
|
-
target_name, param,
|
|
1205
|
+
target_name, param, loading_info, disk_offload_folder, disk_offload_index, mapping
|
|
1115
1206
|
)
|
|
1116
1207
|
else:
|
|
1117
1208
|
set_param_for_module(
|
|
1118
1209
|
model,
|
|
1119
1210
|
target_name,
|
|
1120
1211
|
param,
|
|
1121
|
-
|
|
1122
|
-
missing_keys,
|
|
1123
|
-
unexpected_keys,
|
|
1212
|
+
loading_info,
|
|
1124
1213
|
mapping.distributed_operation,
|
|
1125
1214
|
hf_quantizer,
|
|
1126
1215
|
)
|
|
@@ -1139,7 +1228,7 @@ def convert_and_load_state_dict_in_model(
|
|
|
1139
1228
|
|
|
1140
1229
|
# Keep the current weight conversion mapping for later saving (in case it was coming directly from the user)
|
|
1141
1230
|
model._weight_conversions = weight_mapping
|
|
1142
|
-
return
|
|
1231
|
+
return loading_info, disk_offload_index
|
|
1143
1232
|
|
|
1144
1233
|
|
|
1145
1234
|
def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch.Tensor]):
|
|
@@ -1186,7 +1275,7 @@ def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch
|
|
|
1186
1275
|
new_state_dict = {}
|
|
1187
1276
|
for first_param_name, reversed_converter in conversion_mapping.items():
|
|
1188
1277
|
# Apply the reverse converter
|
|
1189
|
-
realized_value
|
|
1278
|
+
realized_value = reversed_converter.convert(first_param_name, model=model, config=model.config)
|
|
1190
1279
|
for target_name, param in realized_value.items():
|
|
1191
1280
|
param = param[0] if isinstance(param, list) else param
|
|
1192
1281
|
new_state_dict[target_name] = param
|
|
@@ -8,7 +8,6 @@ deps = {
|
|
|
8
8
|
"beautifulsoup4": "beautifulsoup4",
|
|
9
9
|
"blobfile": "blobfile",
|
|
10
10
|
"codecarbon": "codecarbon>=2.8.1",
|
|
11
|
-
"cookiecutter": "cookiecutter==1.7.3",
|
|
12
11
|
"datasets": "datasets>=2.15.0",
|
|
13
12
|
"deepspeed": "deepspeed>=0.9.3",
|
|
14
13
|
"diffusers": "diffusers",
|
|
@@ -17,11 +16,9 @@ deps = {
|
|
|
17
16
|
"faiss-cpu": "faiss-cpu",
|
|
18
17
|
"fastapi": "fastapi",
|
|
19
18
|
"filelock": "filelock",
|
|
20
|
-
"ftfy": "ftfy",
|
|
21
19
|
"fugashi": "fugashi>=1.0",
|
|
22
20
|
"GitPython": "GitPython<3.1.19",
|
|
23
21
|
"hf-doc-builder": "hf-doc-builder>=0.3.0",
|
|
24
|
-
"hf_xet": "hf_xet",
|
|
25
22
|
"huggingface-hub": "huggingface-hub>=1.3.0,<2.0",
|
|
26
23
|
"importlib_metadata": "importlib_metadata",
|
|
27
24
|
"ipadic": "ipadic>=1.0.0,<2.0",
|
|
@@ -30,7 +27,7 @@ deps = {
|
|
|
30
27
|
"kenlm": "kenlm",
|
|
31
28
|
"kernels": "kernels>=0.10.2,<0.11",
|
|
32
29
|
"librosa": "librosa",
|
|
33
|
-
"
|
|
30
|
+
"mistral-common[image]": "mistral-common[image]>=1.8.8",
|
|
34
31
|
"nltk": "nltk<=3.8.1",
|
|
35
32
|
"num2words": "num2words",
|
|
36
33
|
"numpy": "numpy>=1.17",
|
|
@@ -49,14 +46,14 @@ deps = {
|
|
|
49
46
|
"pydantic": "pydantic>=2",
|
|
50
47
|
"pytest": "pytest>=7.2.0,<9.0.0",
|
|
51
48
|
"pytest-asyncio": "pytest-asyncio>=1.2.0",
|
|
49
|
+
"pytest-random-order": "pytest-random-order",
|
|
52
50
|
"pytest-rerunfailures": "pytest-rerunfailures<16.0",
|
|
53
51
|
"pytest-timeout": "pytest-timeout",
|
|
52
|
+
"pytest-env": "pytest-env",
|
|
54
53
|
"pytest-xdist": "pytest-xdist",
|
|
55
54
|
"pytest-order": "pytest-order",
|
|
56
55
|
"python": "python>=3.10.0",
|
|
57
|
-
"ray[tune]": "ray[tune]>=2.7.0",
|
|
58
56
|
"regex": "regex!=2019.12.17",
|
|
59
|
-
"requests": "requests",
|
|
60
57
|
"rhoknp": "rhoknp>=1.1.0,<1.3.1",
|
|
61
58
|
"rjieba": "rjieba",
|
|
62
59
|
"rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
|
|
@@ -77,7 +74,7 @@ deps = {
|
|
|
77
74
|
"tiktoken": "tiktoken",
|
|
78
75
|
"timm": "timm>=1.0.23",
|
|
79
76
|
"tokenizers": "tokenizers>=0.22.0,<=0.23.0",
|
|
80
|
-
"torch": "torch>=2.
|
|
77
|
+
"torch": "torch>=2.4",
|
|
81
78
|
"torchaudio": "torchaudio",
|
|
82
79
|
"torchvision": "torchvision",
|
|
83
80
|
"pyctcdecode": "pyctcdecode>=0.4.0",
|
|
@@ -90,6 +87,8 @@ deps = {
|
|
|
90
87
|
"pytest-rich": "pytest-rich",
|
|
91
88
|
"libcst": "libcst",
|
|
92
89
|
"rich": "rich",
|
|
90
|
+
"ray[tune]": "ray[tune]>=2.7.0",
|
|
93
91
|
"opentelemetry-api": "opentelemetry-api",
|
|
94
|
-
"
|
|
92
|
+
"opentelemetry-exporter-otlp": "opentelemetry-exporter-otlp",
|
|
93
|
+
"opentelemetry-sdk": "opentelemetry-sdk",
|
|
95
94
|
}
|