transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
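A file-level listing like the one below can be reproduced locally. The following is a minimal sketch, not the registry's own tooling: it assumes both wheels have already been downloaded (e.g. with `pip download transformers==5.1.0 --no-deps`) and the paths below are placeholders. It compares the two archives by file name and CRC32 to classify entries as added, removed, or changed.

```python
# Minimal sketch: file-level comparison of two locally downloaded wheels.
# The wheel paths are assumptions; adjust them to wherever the files live.
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local path
NEW = "transformers-5.1.0-py3-none-any.whl"     # assumed local path


def manifest(path: str) -> dict[str, int]:
    """Map each archived file name to its CRC32 for cheap change detection."""
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.CRC for info in zf.infolist()}


old, new = manifest(OLD), manifest(NEW)

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(name for name in set(old) & set(new) if old[name] != new[name])

for label, names in (("added", added), ("removed", removed), ("changed", changed)):
    print(f"{label}: {len(names)} files")
    for name in names:
        print(f"  - {name}")
```

Note that a CRC match does not prove byte-for-byte equality, and this sketch does not produce the per-file +/- line counts shown below; those require diffing the extracted sources.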
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
transformers/integrations/peft.py

@@ -11,14 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import copy
 import inspect
 import json
 import os
-
-
-from
-
+import re
+from dataclasses import replace
+from typing import TYPE_CHECKING, Any, Literal, Optional
+
+from ..conversion_mapping import (
+    _MODEL_TO_CONVERSION_PATTERN,
+    get_checkpoint_conversion_mapping,
+    get_model_conversion_mapping,
+)
+from ..core_model_loading import (
+    Concatenate,
+    ConversionOps,
+    MergeModulelist,
+    Transpose,
+    WeightConverter,
+    WeightRenaming,
+)
 from ..utils import (
     CONFIG_NAME,
     cached_file,
@@ -31,6 +44,7 @@ from ..utils import (
     logging,
 )
 from ..utils.hub import DownloadKwargs
+from ..utils.loading_report import log_state_dict_report
 
 
 if is_torch_available():
@@ -46,6 +60,304 @@ MIN_PEFT_VERSION = "0.18.0"
 
 logger = logging.get_logger(__name__)
 
+if TYPE_CHECKING:
+    from ..modeling_utils import LoadStateDictConfig
+
+
+def _block_diag_3d(*tensors):
+    lora_b_block_diag = []
+    for i in range(len(tensors[0])):
+        lora_b_block_diag.append(torch.block_diag(tensors[0][i], tensors[1][i]))
+    out = torch.stack(lora_b_block_diag, dim=0)
+    return out
+
+
+class PeftConcatenate(Concatenate):
+    """Convert per-expert LoRA weights to merged weights.
+
+    When the base weights are fused, e.g. W01 = [W0, W1], the LoRA weights also need to be fused. To achieve this
+    correctly, concatenate the LoRA A weights along the r (rank) dimension. This doesn't require a new operation. But
+    for LoRA B, the weights need to be merged in a block-diagonal fashion to achieve the correct result.
+
+    To illustrate:
+
+    Before
+        W0' = W0 + A0 @ B0
+        W1' = W1 + A1 @ B1
+
+    After
+        W01' = W01 + A01 @ B01_bd
+    where
+        A01 = [A0, A1]
+        B01_bd = [[B0, 0],
+                  [0, B1]]
+
+    This class is responsible for merging LoRA B in this block-diagonal fashion. Assuming that we fuse N weights, it
+    should look like this:
+
+    1. LoRA B is 2-dim
+    Normal LoRA weight of shape (out_feat, rank), the output shape should be (N * out_feat, N * rank).
+
+    2. LoRA B is 3-dim
+    MoE LoRA weight of shape (experts, out_feat, rank), the output shape should be (experts, N * out_feat, N * rank).
+
+    After this, the experts x rank dimensions are flattened, as PEFT expects 2-dim tensors for LoRA.
+    """
+
+    @torch.no_grad
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        full_layer_name: str,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        dims = [v.dim() for v in input_dict.values()]
+        if set(dims) not in ({2}, {3}):
+            raise ValueError(
+                f"To convert this LoRA adapter, the LoRA weights all need to have either 2 or 3 dims, got {set(dims)}"
+            )
+
+        if set(dims) == {2}:
+            output_dict = {full_layer_name: torch.block_diag(*input_dict.values())}
+        else:
+            out = _block_diag_3d(*input_dict.values())  # shape = experts, 2*out_feat, 2*r
+            out = torch.permute(out, (2, 0, 1))  # shape = 2*r, experts, 2*out_feat
+            out = out.flatten(0, 1)  # shape = 2*r * experts, 2*out_feat
+            out = out.T
+            output_dict = {full_layer_name: out}
+        return output_dict
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        raise NotImplementedError("Reversing PEFT LoRA MoE conversions is not supported yet.")
+
+
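To sanity-check the block-diagonal construction described in the docstring above (illustrative shapes and names only, not part of the diff), one can verify that the fused delta equals the stacked per-projection deltas:

    import torch

    r, d_in, d_out = 4, 8, 16
    A0, A1 = torch.randn(r, d_in), torch.randn(r, d_in)    # LoRA A: (rank, in_feat)
    B0, B1 = torch.randn(d_out, r), torch.randn(d_out, r)  # LoRA B: (out_feat, rank)

    A01 = torch.cat([A0, A1], dim=0)  # (2*r, d_in): A weights concatenated along the rank dim
    B01 = torch.block_diag(B0, B1)    # (2*d_out, 2*r): B weights merged block-diagonally

    delta_fused = B01 @ A01           # (2*d_out, d_in)
    delta_split = torch.cat([B0 @ A0, B1 @ A1], dim=0)
    assert torch.allclose(delta_fused, delta_split)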
+class FlattenDims(ConversionOps):
+    """
+    Flatten the tensors along the given dimensions.
+    """
+
+    def __init__(self, dims: int | tuple[int, ...]):
+        if isinstance(dims, int):
+            dims = (dims,)
+        self.dims = dims
+
+    @torch.no_grad
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        config,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        output_dict = {k: v.flatten(*self.dims) for k, v in input_dict.items()}
+        return output_dict
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        raise NotImplementedError("Reversing the flattening operation is not supported.")
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(dims={self.dims})"
+
+
+class PermuteDims(ConversionOps):
+    """
+    Permute the tensors along the given dimensions.
+    """
+
+    def __init__(self, dims: tuple[int, ...]):
+        self.dims = dims
+
+    @torch.no_grad
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        config,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        output_dict = {k: v.permute(*self.dims) for k, v in input_dict.items()}
+        return output_dict
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        raise NotImplementedError("Reversing the permute operation is not supported yet.")
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(dims={self.dims})"
+
+
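The two operations above feed the down_proj LoRA B path built further below (MergeModulelist, then PermuteDims((2, 0, 1)), FlattenDims((0, 1)), Transpose). A rough shape trace with made-up sizes shows how a stacked 3-dim MoE tensor ends up 2-dim:

    import torch

    experts, d_out, r = 8, 16, 4
    lora_b = torch.randn(experts, d_out, r)  # per-expert LoRA B, stacked by MergeModulelist
    x = lora_b.permute(2, 0, 1)              # PermuteDims((2, 0, 1)) -> (r, experts, d_out)
    x = x.flatten(0, 1)                      # FlattenDims((0, 1)) -> (r * experts, d_out)
    x = x.transpose(0, 1)                    # Transpose(dim0=0, dim1=1) -> (d_out, r * experts)
    assert x.shape == (d_out, r * experts)   # 2-dim, as PEFT expects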
+def _build_peft_weight_mapping(
+    weight_conversions: list[WeightConverter | WeightRenaming] | None, adapter_name: str, peft_config=None
+) -> list[WeightConverter | WeightRenaming]:
+    # We iterate over all the operations of the original model and simply edit them to apply to the PEFT adapter when
+    # appropriate.
+    if not weight_conversions:
+        return []
+
+    # strip "base_model.model" and add adapter name
+    new_weight_conversions = [WeightRenaming("base_model.model.model.", "model.")]
+
+    prefixes = set()
+    from peft.mapping import PEFT_TYPE_TO_PREFIX_MAPPING
+
+    peft_type = getattr(peft_config, "peft_type", None)
+    if peft_type in PEFT_TYPE_TO_PREFIX_MAPPING:
+        prefixes.add(PEFT_TYPE_TO_PREFIX_MAPPING[peft_type])
+    else:
+        prefixes.update(PEFT_TYPE_TO_PREFIX_MAPPING.values())
+
+    for prefix in sorted(prefixes):
+        escaped_prefix = re.escape(prefix)
+        new_weight_conversions.append(
+            WeightRenaming(
+                source_patterns=rf"({escaped_prefix}[^\.]*)",
+                target_patterns=rf"\1.{adapter_name}",
+            )
+        )
+
+    for orig_conversion in weight_conversions:
+        if isinstance(orig_conversion, WeightRenaming):
+            new_weight_conversions.append(orig_conversion)
+            continue
+
+        if orig_conversion.target_patterns == ["mlp.experts.gate_up_proj"]:
+            # gate_up_proj requires both merging the experts and concatenating for the fusion of w1 and w3
+            for lora in ("lora_A", "lora_B"):  # TODO: lora_embedding_A and lora_embedding_B
+                # deal with operations
+                peft_weight_operations = []
+                for op in orig_conversion.operations:
+                    if isinstance(op, Concatenate):
+                        if lora == "lora_B":  # block-diagonal concat
+                            peft_weight_operations.append(PeftConcatenate(dim=op.dim))
+                        else:  # normal concat + flatten
+                            peft_weight_operations.append(op)
+                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
+                    elif isinstance(op, MergeModulelist):
+                        peft_weight_operations.append(op)
+
+                # TODO: this assumption may not hold for models != mixtral
+                # For source, we capture the original weights + the lora weights
+                new_source_patterns = []
+                for pat in list(orig_conversion.source_patterns):
+                    # we replace the weight pattern to collect loras
+                    pat = pat.rsplit(".", 1)[0]
+                    # note: the source state_dict does *not* contain the adapter name
+                    new_source_patterns.append(f"{pat}.{lora}.*")
+
+                # the gate_up_proj is the inner PEFT ParamWrapper, so we need to use base_layer
+                pat = orig_conversion.target_patterns[0]
+                pat = pat.replace("gate_up_proj", "base_layer")
+                # we make sure the target key is correct, add '.weight' because the parameter is targeted directly
+                new_target_patterns = [f"{pat}.{lora}.{adapter_name}.weight"]
+
+                # Instantiate a new object that correctly post-processes patterns if needed
+                new_conversion = orig_conversion.__class__(
+                    source_patterns=new_source_patterns,
+                    target_patterns=new_target_patterns,
+                    distributed_operation=orig_conversion.distributed_operation,
+                    quantization_operation=orig_conversion.quantizatin_operations,
+                    operations=new_weight_conversions,
+                )
+                new_weight_conversions.append(new_conversion)
+
+        elif orig_conversion.target_patterns == ["mlp.experts.down_proj"]:
+            # down_proj only requires merging of experts
+            for lora in ("lora_A", "lora_B"):  # TODO: lora_embedding_A and lora_embedding_B
+                peft_weight_operations = []
+                for op in orig_conversion.operations:
+                    if isinstance(op, MergeModulelist):
+                        peft_weight_operations.append(op)
+                        if lora == "lora_A":
+                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
+                        else:
+                            peft_weight_operations.append(PermuteDims(dims=(2, 0, 1)))
+                            peft_weight_operations.append(FlattenDims(dims=(0, 1)))
+                            peft_weight_operations.append(Transpose(dim0=0, dim1=1))
+
+                # TODO: this assumption may not hold for models != mixtral
+                # For source, we capture the original weights + the lora weights
+                new_source_patterns = []
+                for pat in list(orig_conversion.source_patterns):
+                    # we replace the weight pattern to collect loras
+                    pat = pat.rsplit(".", 1)[0]
+                    # note: the source state_dict does *not* contain the adapter name
+                    new_source_patterns.append(f"{pat}.{lora}.*")
+
+                # the down_proj is the outer PEFT ParamWrapper, so we remove the prefix
+                pat = orig_conversion.target_patterns[0]
+                pat = pat.replace(".down_proj", "")
+                # we make sure the target key is correct, add '.weight' because the parameter is targeted directly
+                new_target_patterns = [f"{pat}.{lora}.{adapter_name}.weight"]
+
+                # Instantiate a new object that correctly post-processes patterns if needed
+                new_conversion = orig_conversion.__class__(
+                    source_patterns=new_source_patterns,
+                    target_patterns=new_target_patterns,
+                    distributed_operation=orig_conversion.distributed_operation,
+                    quantization_operation=orig_conversion.quantizatin_operations,
+                    operations=new_weight_conversions,
+                )
+                new_weight_conversions.append(new_conversion)
+
+    return new_weight_conversions
+
+
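The prefix-renaming rules built here are regex-based. Assuming WeightRenaming applies its patterns with re.sub semantics, the rule for PEFT's LoRA prefix ("lora_" in PEFT_TYPE_TO_PREFIX_MAPPING) splices the adapter name in right after the LoRA marker, e.g.:

    import re

    pattern = r"(lora_[^\.]*)"  # escaped "lora_" prefix, as constructed above
    key = "model.layers.0.mlp.experts.base_layer.lora_A.weight"
    print(re.sub(pattern, r"\1.default", key))
    # model.layers.0.mlp.experts.base_layer.lora_A.default.weight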
+# The main reason we have to make this explicit is because the conversion mapping
+# has the full layer name, while the config does not. We could regex match, but
+# this is more explicit and less error-prone.
+_MOE_TARGET_MODULE_MAPPING: dict[str, dict[str, str]] = {
+    "mixtral": {
+        "gate": "gate.weight",
+        "w1": "gate_up_proj",
+        "w3": "gate_up_proj",
+        "w2": "down_proj",
+    },
+    "qwen2_moe": {
+        "gate": "gate.weight",
+        "gate_proj": "gate_up_proj",
+        "up_proj": "gate_up_proj",
+        "down_proj": "down_proj",
+    },
+}
+
+_MOE_FUSED_TARGETS: dict[str, dict[str, set[str]]] = {
+    "mixtral": {"gate_up_proj": {"w1", "w3"}},
+    "qwen2_moe": {"gate_up_proj": {"gate_proj", "up_proj"}},
+}
+
+
+def patch_moe_parameter_targeting(model, peft_config):
+    """PEFT currently assumes that expert layers are of shape
+    (expert, in, out)
+    but with Mixtral in transformers v5 this is not true anymore.
+    This will be addressed in PEFT >0.19; until then, we need to handle
+    it here for now.
+    """
+    from functools import wraps
+
+    import peft
+
+    model_type = getattr(model.config, "model_type", None)
+    if get_checkpoint_conversion_mapping(model_type) is not None:
+        update_layer = peft.tuners.lora.layer.ParamWrapper.update_layer
+
+        @wraps(update_layer)
+        def new_update_layer(layer, *args, **kwargs):
+            if not hasattr(layer, "_swapped_in_out") and layer.parameter_name in ("down_proj", "gate_up_proj"):
+                tmp_in_features = layer.in_features
+                layer.in_features = layer.out_features
+                layer.out_features = tmp_in_features
+                layer._swapped_in_out = True
+            return update_layer(layer, *args, **kwargs)
+
+        peft.tuners.lora.layer.ParamWrapper.update_layer = new_update_layer
+
 
 class PeftAdapterMixin:
     """
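patch_moe_parameter_targeting relies on a standard wrap-and-replace monkey patch. A generic sketch of the same pattern on a toy class (not the PEFT internals):

    from functools import wraps

    class Layer:
        def update(self, x):
            return x

    _orig_update = Layer.update

    @wraps(_orig_update)
    def _patched_update(self, x):
        # run a one-off fixup, then defer to the original method
        return _orig_update(self, x * 2)

    Layer.update = _patched_update
    assert Layer().update(3) == 6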
@@ -73,12 +385,6 @@ class PeftAdapterMixin:
         self,
         peft_model_id: str | None = None,
         adapter_name: str | None = None,
-        revision: str | None = None,
-        token: str | None = None,
-        device_map: str = "auto",
-        max_memory: str | None = None,
-        offload_folder: str | None = None,
-        offload_index: int | None = None,
         peft_config: dict[str, Any] | None = None,
         adapter_state_dict: dict[str, "torch.Tensor"] | None = None,
         low_cpu_mem_usage: bool = False,
@@ -86,6 +392,8 @@
         hotswap: bool | Literal["auto"] = "auto",
         local_files_only: bool = False,
         adapter_kwargs: dict[str, Any] | None = None,
+        load_config: Optional["LoadStateDictConfig"] = None,
+        **kwargs,
     ) -> None:
         """
         Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we
@@ -99,35 +407,10 @@
                 and adapter weights.
             adapter_name (`str`, *optional*):
                 The adapter name to use. If not set, will use the name "default".
-
-
-
-
-
-                > [!TIP]
-                > To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
-
-            token (`str`, `optional`):
-                Whether to use authentication token to load the remote folder. Useful to load private repositories
-                that are on HuggingFace Hub. You might need to call `hf auth login` and paste your tokens to
-                cache it.
-            device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
-                A map that specifies where each submodule should go. It doesn't need to be refined to each
-                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
-                same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
-                like `1`) on which the model will be allocated, the device map will map the entire model to this
-                device. Passing `device_map = 0` means put the whole model on GPU 0.
-
-                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
-            max_memory (`Dict`, *optional*):
-                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
-                GPU and the available CPU RAM if unset.
-            offload_folder (`str` or `os.PathLike`, `optional`):
-                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
-            offload_index (`int`, `optional`):
-                `offload_index` argument to be passed to `accelerate.dispatch_model` method.
+            load_config (`LoadStateDictConfig`, *optional*):
+                A load configuration to reuse when pulling adapter weights, typically from `from_pretrained`.
+            kwargs (`dict[str, Any]`, *optional*):
+                Additional `LoadStateDictConfig` fields passed as keyword arguments.
             peft_config (`dict[str, Any]`, *optional*):
                 The configuration of the adapter to add, supported adapters are all non-prompt learning configs (LoRA,
                 IA³, etc). This argument is used in case users directly pass PEFT state dicts.
@@ -175,10 +458,18 @@
             Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and
             `find_adapter_config_file` method.
         """
-        check_peft_version(min_version=MIN_PEFT_VERSION)
-
         from peft import PeftType
 
+        from ..modeling_utils import LoadStateDictConfig, _get_resolved_checkpoint_files
+
+        if local_files_only:
+            kwargs["local_files_only"] = True
+        base_load_config = load_config.__dict__ if load_config is not None else {}
+        base_load_config.update(kwargs)
+        base_load_config.setdefault("pretrained_model_name_or_path", None)
+        load_config = LoadStateDictConfig(**base_load_config)
+        peft_model_id = peft_model_id or load_config.pretrained_model_name_or_path
+
         if hotswap == "auto":
             # if user called model.enable_peft_hotswap and this is not the first adapter, enable hotswap
             hotswap_enabled = getattr(self, "_hotswap_enabled", False)
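The load_config merging at the top of this hunk folds an existing config dataclass and loose keyword overrides into a single object. A minimal sketch of that pattern, with a hypothetical dataclass standing in for LoadStateDictConfig:

    from dataclasses import dataclass, field

    @dataclass
    class LoadCfg:  # hypothetical stand-in, not the real LoadStateDictConfig
        pretrained_model_name_or_path: str | None = None
        use_safetensors: bool | None = None
        download_kwargs: dict = field(default_factory=dict)

    existing = LoadCfg(use_safetensors=True)
    overrides = {"pretrained_model_name_or_path": "user/adapter"}

    base = dict(existing.__dict__)  # start from the fields of the existing config
    base.update(overrides)          # keyword overrides win
    base.setdefault("pretrained_model_name_or_path", None)
    merged = LoadCfg(**base)
    assert merged.use_safetensors is True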
@@ -193,18 +484,10 @@
             if any(conf.peft_type != PeftType.LORA for conf in self.peft_config.values()):
                 raise ValueError("Hotswapping is currently only supported for LoRA, please set `hotswap=False`.")
 
-        key_mapping = adapter_kwargs.pop("key_mapping", None) if adapter_kwargs is not None else None
-        weight_conversions = get_model_conversion_mapping(self, key_mapping=key_mapping)
-        # peft only supports low_cpu_mem_usage starting from v0.13.0
-        peft_load_kwargs = {}
-        peft_load_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
-
         adapter_name = adapter_name if adapter_name is not None else "default"
-
-        adapter_kwargs = {}
+        adapter_kwargs = adapter_kwargs or {}
 
-        from peft import PeftConfig, inject_adapter_in_model
-        from peft.utils import set_peft_model_state_dict
+        from peft import PeftConfig, inject_adapter_in_model
 
         if self._hf_peft_config_loaded and (not hotswap) and (adapter_name in self.peft_config):
             raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.")
@@ -218,34 +501,11 @@
                 "You should either pass a `peft_model_id` or a `peft_config` and `adapter_state_dict` to load an adapter."
             )
 
-        if "device" not in adapter_kwargs:
-            device = self.device if not hasattr(self, "hf_device_map") else list(self.hf_device_map.values())[0]
-        else:
-            device = adapter_kwargs.pop("device")
-
-        # To avoid PEFT errors later on with safetensors.
-        if isinstance(device, torch.device):
-            device = str(device)
-
-        # We keep `revision` in the signature for backward compatibility
-        if revision is not None and "revision" not in adapter_kwargs:
-            adapter_kwargs["revision"] = revision
-        elif revision is not None and "revision" in adapter_kwargs and revision != adapter_kwargs["revision"]:
-            logger.error(
-                "You passed a `revision` argument both in `adapter_kwargs` and as a standalone argument. "
-                "The one in `adapter_kwargs` will be used."
-            )
-
-        # Override token with adapter_kwargs' token
-        if "token" in adapter_kwargs:
-            token = adapter_kwargs.pop("token")
-
         if peft_config is None:
+            load_config.download_kwargs.update(**adapter_kwargs)
             adapter_config_file = find_adapter_config_file(
                 peft_model_id,
-
-                local_files_only=local_files_only,
-                **adapter_kwargs,
+                **load_config.download_kwargs,
             )
 
             if adapter_config_file is None:
@@ -256,120 +516,85 @@
 
             peft_config = PeftConfig.from_pretrained(
                 peft_model_id,
-
-                local_files_only=local_files_only,
-                **adapter_kwargs,
+                **load_config.download_kwargs,
             )
+
+        weight_conversions = get_model_conversion_mapping(self)
+        peft_config = convert_peft_config_for_transformers(peft_config, model=self, conversions=weight_conversions)
+
+        if hasattr(peft_config, "inference_mode"):
             peft_config.inference_mode = not is_trainable
 
+        peft_weight_conversions = _build_peft_weight_mapping(weight_conversions, adapter_name, peft_config=peft_config)
+
+        patch_moe_parameter_targeting(model=self, peft_config=peft_config)
+
         if not hotswap:
-            # TODO: WE NEED TOO APPLY OUR DYNAMIC WEIGHT CONVERSION AT SOME POINT HERE!
             # Create and add fresh new adapters into the model, unless the weights are hotswapped
-            inject_adapter_in_model(peft_config, self, adapter_name
+            inject_adapter_in_model(peft_config, self, adapter_name)
 
         if not self._hf_peft_config_loaded:
             self._hf_peft_config_loaded = True
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            elif key.endswith("lora_B.bias"):  # lora_bias=True option
-                new_key = new_key[: -len(".bias")] + f".{adapter_name}.bias"
-            processed_adapter_state_dict[new_key] = value
-
-        # Load state dict
-        if not hotswap:
-            incompatible_keys = set_peft_model_state_dict(
-                self, processed_adapter_state_dict, adapter_name, **peft_load_kwargs
-            )
+        if adapter_state_dict is None:
+            adapter_filenames = ["adapter_model.safetensors", "adapter_model.bin"]
+            if load_config.use_safetensors is False:
+                adapter_filenames.reverse()
+
+            checkpoint_files = sharded_metadata = None
+            last_error = None
+            for adapter_filename in adapter_filenames:
+                try:
+                    checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
+                        pretrained_model_name_or_path=peft_model_id,
+                        variant=None,
+                        gguf_file=None,
+                        use_safetensors=(
+                            load_config.use_safetensors if adapter_filename.endswith(".safetensors") else False
+                        ),
+                        user_agent=None,
+                        is_remote_code=False,
+                        transformers_explicit_filename=adapter_filename,
+                        download_kwargs=load_config.download_kwargs,
+                    )
+                    break
+                except OSError as error:
+                    last_error = error
 
-        if
-
-            # If the user called enable_peft_hotswap, we need to ensure it is called:
-            # - after the first adapter was loaded
-            # - before the model is compiled and the 2nd adapter is being hotswapped in
-            # Therefore, it needs to be called here
-            from peft.utils.hotswap import prepare_model_for_compiled_hotswap
-
-            prepare_model_for_compiled_hotswap(self, config=peft_config, **self._prepare_peft_hotswap_kwargs)
-            # We only want to call prepare_model_for_compiled_hotswap once
-            self._prepare_peft_hotswap_kwargs = None
+            if checkpoint_files is None:
+                raise last_error or OSError("Could not download either a .bin or a .safetensors adapter file.")
         else:
-
-
-            check_hotswap_configs_compatible(self.peft_config[adapter_name], peft_config)
-            try:
-                hotswap_adapter_from_state_dict(
-                    model=self,
-                    state_dict=processed_adapter_state_dict,
-                    adapter_name=adapter_name,
-                    config=peft_config,
-                )
-            except Exception as e:
-                logger.error(f"Hotswapping {adapter_name} was unsucessful with the following error: \n{e}")
-                raise
-            incompatible_keys = None
-
-        if incompatible_keys is not None:
-            err_msg = ""
-            origin_name = peft_model_id if peft_model_id is not None else "state_dict"
-            # Check for unexpected keys.
-            if hasattr(incompatible_keys, "unexpected_keys") and len(incompatible_keys.unexpected_keys) > 0:
-                err_msg = (
-                    f"Loading adapter weights from {origin_name} led to unexpected keys not found in the model: "
-                    f"{', '.join(incompatible_keys.unexpected_keys)}. "
-                )
+            checkpoint_files, sharded_metadata = [], {}
 
-
-
-
-
-
-
-
-
-
+        load_config = replace(
+            load_config,
+            pretrained_model_name_or_path=peft_model_id,
+            sharded_metadata=sharded_metadata,
+            weight_mapping=peft_weight_conversions,
+        )
+        loading_info, _ = self._load_pretrained_model(
+            model=self,
+            state_dict=adapter_state_dict,
+            checkpoint_files=checkpoint_files,
+            load_config=load_config,
+        )
 
-
-
+        adapter_key_markers = {adapter_name}
+        if peft_config is not None and getattr(peft_config, "peft_type", None) is not None:
+            adapter_key_markers.add(peft_config.peft_type.value.lower())
 
-
-
+        def is_adapter_key(key: str) -> bool:
+            return any(marker in key for marker in adapter_key_markers)
 
-
-
-
-
-
-
-
-
-
-            offload_folder=offload_folder,
-            offload_index=offload_index,
-        )
+        loading_info.missing_keys = {k for k in loading_info.missing_keys if is_adapter_key(k)}
+
+        log_state_dict_report(
+            model=self,
+            pretrained_model_name_or_path=load_config.pretrained_model_name_or_path,
+            ignore_mismatched_sizes=load_config.ignore_mismatched_sizes,
+            loading_info=loading_info,
+            logger=logger,
+        )
 
     def enable_peft_hotswap(
         self, target_rank: int = 128, check_compiled: Literal["error", "warn", "ignore"] = "error"
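The is_adapter_key filter above exists because, from the adapter checkpoint's point of view, every base-model weight is "missing" by construction; only keys carrying the adapter name or the PEFT-type marker are worth reporting. Roughly:

    adapter_key_markers = {"default", "lora"}  # adapter name + peft_type.value.lower()

    def is_adapter_key(key: str) -> bool:
        return any(marker in key for marker in adapter_key_markers)

    missing = {
        "model.layers.0.self_attn.q_proj.weight",                 # base weight, expected to be absent
        "model.layers.0.self_attn.q_proj.lora_A.default.weight",  # genuinely missing adapter weight
    }
    print({k for k in missing if is_adapter_key(k)})
    # {'model.layers.0.self_attn.q_proj.lora_A.default.weight'}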
@@ -732,3 +957,90 @@ def maybe_load_adapters(
             pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"]
 
     return _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs
+
+
+#####################
+# weight conversion #
+#####################
+
+# With transformers v5, we need to convert some weights to reflect updated model architectures. If users have trained
+# PEFT adapters for these models, they also need to be updated. This may require updating the PEFT config too. The
+# logic for this is found below. Right now, only LoRA is supported.
+
+# TODO: These functions will be added to PEFT in release 0.19.0. Drop them here once 0.19.0 becomes the min PEFT
+# version.
+
+
+def _convert_peft_config_moe(peft_config, model_type: str):
+    base_model_type = _MODEL_TO_CONVERSION_PATTERN.get(model_type, None)
+    if base_model_type is None:
+        return peft_config
+
+    target_module_mapping = _MOE_TARGET_MODULE_MAPPING[base_model_type]
+    fused_targets = _MOE_FUSED_TARGETS.get(base_model_type, {})
+
+    peft_config.target_parameters = set(peft_config.target_parameters or [])
+    peft_config.target_modules = set(peft_config.target_modules or [])
+    if not hasattr(peft_config, "rank_pattern") or peft_config.rank_pattern is None:
+        peft_config.rank_pattern = {}
+
+    new_target_parameters = peft_config.target_parameters.copy()
+    remaining_target_modules = set()
+    matched_targets: dict[str, set[str]] = {new_name: set() for new_name in fused_targets}
+
+    for target in peft_config.target_modules:
+        mapped_new_name = None
+        mapped_old_name = None
+        for old_name, new_name in target_module_mapping.items():
+            if (target == old_name) or target.endswith(f".{old_name}"):
+                mapped_new_name = new_name
+                mapped_old_name = old_name
+                break
+
+        if mapped_new_name is None:
+            remaining_target_modules.add(target)
+            continue
+
+        new_target_parameters.add(mapped_new_name)
+        if mapped_new_name in fused_targets and mapped_old_name is not None:
+            matched_targets.setdefault(mapped_new_name, set()).add(mapped_old_name)
+
+    for new_name, required_old_targets in fused_targets.items():
+        present_targets = matched_targets.get(new_name, set())
+        if 0 < len(present_targets) < len(required_old_targets):
+            missing = ", ".join(sorted(required_old_targets - present_targets))
+            present = ", ".join(sorted(present_targets))
+            raise ValueError(
+                f"Cannot convert PEFT target(s) {present} without also targeting {missing} because they are fused into {new_name}."
+            )
+
+        if len(present_targets) == len(required_old_targets) and len(required_old_targets) > 1:
+            peft_config.rank_pattern[rf".*\.{re.escape(new_name)}"] = peft_config.r * len(required_old_targets)
+
+    peft_config.target_parameters = new_target_parameters
+    peft_config.target_modules = remaining_target_modules
+
+    return peft_config
+
+
+def convert_peft_config_for_transformers(peft_config, model: torch.nn.Module, conversions: list[Any] | None):
+    # FIXME document this properly
+    # If, for any reason, we cannot apply conversion, we just return the PEFT config as is.
+    from peft import PeftType  # avoid circular import
+
+    if peft_config.peft_type != PeftType.LORA:
+        # weight conversion is currently only supported for LoRA
+        return peft_config
+    if not hasattr(model, "config"):
+        # not a transformers model
+        return peft_config
+    if not hasattr(model.config, "model_type"):
+        # not a transformers model
+        return peft_config
+
+    peft_config = copy.deepcopy(peft_config)  # don't mutate the original config
+    model_type = getattr(model.config, "model_type", None)
+    if get_checkpoint_conversion_mapping(model_type) is not None:
+        peft_config = _convert_peft_config_moe(peft_config, model_type)
+
+    return peft_config