transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
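For readers who want to reproduce a per-file summary like the one below, here is a minimal sketch. It assumes both wheels have already been downloaded locally (for example with `pip download transformers==<version> --no-deps`) under their standard filenames; the exact counts may differ slightly from the registry's, since diff tools vary in how they count changed lines.

```python
# Minimal sketch: produce a per-file "+added -removed" summary between two wheels.
# Assumes both wheels are already present in the working directory, e.g. via:
#   pip download transformers==5.0.0rc3 --no-deps
#   pip download transformers==5.1.0 --no-deps
import difflib
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local filenames
NEW = "transformers-5.1.0-py3-none-any.whl"

def read_wheel(path):
    """Map each .py member of the wheel archive to its decoded source lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = read_wheel(OLD), read_wheel(NEW)
for name in sorted(old.keys() | new.keys()):
    a, b = old.get(name, []), new.get(name, [])  # missing file -> all added/removed
    diff = list(difflib.unified_diff(a, b, lineterm=""))
    added = sum(1 for l in diff if l.startswith("+") and not l.startswith("+++"))
    removed = sum(1 for l in diff if l.startswith("-") and not l.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")
```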
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
+# Copyright 2026 The LG AI Research and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LG AI Research EXAONE Lab"""
+
+import torch
+import torch.nn as nn
+
+from ... import initialization as init
+from ...cache_utils import Cache
+from ...configuration_utils import PreTrainedConfig, layer_type_validation
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, is_grouped_mm_available
+from ..deepseek_v3.modeling_deepseek_v3 import (
+    DeepseekV3MoE,
+    DeepseekV3NaiveMoe,
+    DeepseekV3TopkRouter,
+)
+from ..exaone4.configuration_exaone4 import Exaone4Config
+from ..exaone4.modeling_exaone4 import (
+    Exaone4Attention,
+    Exaone4ForCausalLM,
+    Exaone4Model,
+    Exaone4PreTrainedModel,
+)
+from ..olmoe.modeling_olmoe import (
+    OlmoeDecoderLayer,
+)
+from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeMLP
+
+
+class ExaoneMoeConfig(Exaone4Config):
+    model_type = "exaone_moe"
+
+    r"""
+    This is the configuration class to store the configuration of a [`ExaoneMoeModel`]. It is used to
+    instantiate an EXAONE MoE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the K-EXAONE-236B-A23B [LGAI-EXAONE/K-EXAONE-236B-A23B](https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 102400):
+            Vocabulary size of the EXAONE MoE model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ExaoneMoeModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimensionality of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 32768 for EXAONE 3.5).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 53):
+            End of stream token id.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            The size of the sliding window for the sliding window attention.
+        sliding_window_pattern (`int` or `str`, *optional*, defaults to 4):
+            The pattern to use for sliding window attention. Can be one of:
+                - `None`: No sliding window attention is used
+                - `int`: Every `sliding_window_pattern` layers, use global attention; otherwise use local attention.
+                - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
+                  attention pattern. The pattern starts from layer 0 and repeats with a period equal to the pattern
+                  length. The final layer always uses global attention regardless of the pattern.
+            For instance, sliding_window_pattern="LLLG" is the same as sliding_window_pattern=4, which means:
+                - Layer 0, 1, 2: local attention,
+                - Layer 3: global attention,
+                ...(repeated)
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Prioritized over `sliding_window_pattern`.
+        mlp_layer_types (`list`, *optional*):
+            MLP pattern for each layer. Prioritized over `first_k_dense_replace`.
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        moe_intermediate_size (`int`, *optional*, defaults to 1024):
+            Dimension of the MoE representations.
+        num_experts (`int`, *optional*, defaults to 64):
+            Number of routed experts.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts; `None` means a dense model.
+        num_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the weights of the routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token (ensuring the selected experts are only within `topk_group` groups).
+
+    Example:
+
+    ```python
+    >>> from transformers import ExaoneMoeModel, ExaoneMoeConfig
+
+    >>> # Initializing an EXAONE MoE configuration
+    >>> configuration = ExaoneMoeConfig()
+
+    >>> # Initializing a model from configuration
+    >>> model = ExaoneMoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=53,
+        pad_token_id=0,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        layer_types=None,
+        mlp_layer_types=None,
+        first_k_dense_replace=1,
+        moe_intermediate_size=1024,
+        num_experts=64,
+        num_experts_per_tok=8,
+        num_shared_experts=1,
+        norm_topk_prob=True,
+        routed_scaling_factor=2.5,
+        n_group=1,
+        topk_group=1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.rope_parameters = rope_parameters
+
+        self.layer_types = layer_types
+        if self.sliding_window is None:
+            sliding_window_pattern = 0
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers)
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        self.mlp_layer_types = mlp_layer_types
+        if self.mlp_layer_types is None:
+            self.mlp_layer_types = [
+                "dense" if i < self.first_k_dense_replace else "sparse" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False)
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+
+        PreTrainedConfig.__init__(self, **kwargs)
+
+
+class ExaoneMoeAttention(Exaone4Attention):
+    pass
+
+
+class ExaoneMoeMLP(Qwen2MoeMLP):
+    pass
+
+
+class ExaoneMoeTopkRouter(DeepseekV3TopkRouter):
+    def __init__(self, config):
+        nn.Module.__init__(self)
+        self.config = config
+        self.weight = nn.Parameter(torch.empty((config.num_experts, config.hidden_size)))
+        self.register_buffer("e_score_correction_bias", torch.zeros(config.num_experts))
+
+
+class ExaoneMoeExperts(DeepseekV3NaiveMoe):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_experts = config.num_experts
+
+
+class ExaoneMoeSparseMoEBlock(DeepseekV3MoE):
+    def __init__(self, config):
+        super().__init__(config)
+        self.experts = ExaoneMoeExperts(config)
+        self.shared_experts = ExaoneMoeMLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
+        )
+        self.n_routed_experts = config.num_experts
+
+
+class ExaoneMoeDecoderLayer(OlmoeDecoderLayer):
+    def __init__(self, config: ExaoneMoeConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.mlp = (
+            ExaoneMoeSparseMoEBlock(config) if config.mlp_layer_types[layer_idx] == "sparse" else ExaoneMoeMLP(config)
+        )
+
+
+class ExaoneMoePreTrainedModel(Exaone4PreTrainedModel):
+    config: ExaoneMoeConfig
+
+    _can_record_outputs = {
+        "hidden_states": ExaoneMoeDecoderLayer,
+        "attentions": ExaoneMoeAttention,
+        "router_logits": ExaoneMoeSparseMoEBlock,
+    }
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
+    _keys_to_ignore_on_load_unexpected = [r"mtp.*"]
+
+    @torch.no_grad()
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, ExaoneMoeTopkRouter):
+            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
+        elif isinstance(module, ExaoneMoeExperts):
+            init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
+            init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+
+
+class ExaoneMoeModel(Exaone4Model):
+    pass
+
+
+class ExaoneMoeForCausalLM(Exaone4ForCausalLM):
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+        >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/K-EXAONE-236B-A23B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/K-EXAONE-236B-A23B")
+
+        >>> prompt = "Explain how wonderful you are"
+        >>> messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        >>> input_ids = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            enable_thinking=False,
+        )
+
+        >>> output = model.generate(**input_ids.to(model.device), max_new_tokens=128)
+        >>> tokenizer.decode(output[0], skip_special_tokens=False)
+        "<|system|>\nYou are a helpful assistant.<|endofturn|>\n<|user|>\nExplain how wonderful you are<|endofturn|>\n<|assistant|>\n<think>\n\n</think>\n\nThank you for the kind question! While I can't feel emotions or take pride in the way humans do, I *can* share what makes me uniquely helpful and capable—qualities that many people find wonderful.\n\nHere’s how I can support you:\n\n🌟 **Knowledge at Your Fingertips** \nI have access to a vast amount of information across countless topics—from science and history to technology and creative writing. Whether you're curious, learning, or solving a problem, I can help explain things clearly and accurately.\n\n💬 **Clear, Helpful Communication** \nI aim to respond in a way that's easy to understand, whether you need a simple explanation or a detailed analysis. I adapt my tone and depth to match"
+        ```
+        """
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+
+__all__ = [
+    "ExaoneMoeConfig",
+    "ExaoneMoePreTrainedModel",
+    "ExaoneMoeModel",
+    "ExaoneMoeForCausalLM",
+]
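The heart of the new configuration is the pair of per-layer schedules built in `__init__`: `layer_types` alternates sliding and global attention according to `sliding_window_pattern`, and `mlp_layer_types` keeps the first `first_k_dense_replace` layers dense before switching to MoE. Below is a minimal sketch of that derivation in plain Python, with illustrative values and assuming an integer pattern (the string "L"/"G" form is handled separately):

```python
num_hidden_layers = 8
sliding_window_pattern = 4  # every 4th layer gets global attention
first_k_dense_replace = 1   # embed -> dense -> moe -> moe -> ...

layer_types = [
    "sliding_attention" if (i + 1) % sliding_window_pattern != 0 else "full_attention"
    for i in range(num_hidden_layers)
]
mlp_layer_types = [
    "dense" if i < first_k_dense_replace else "sparse"
    for i in range(num_hidden_layers)
]

print(layer_types[:4])      # ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
print(mlp_layer_types[:3])  # ['dense', 'sparse', 'sparse']
```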
@@ -82,11 +82,15 @@ class FalconConfig(PreTrainedConfig):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 11):
             The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
         ffn_hidden_size (`int`, *optional*):
             The hidden size of the feedforward layer in the Transformer decoder.
             defaults to 4x hidden dim
         activation (`str`, *optional*, defaults to `"gelu"`):
             The activation function used in the feedforward layer.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -128,8 +132,10 @@ class FalconConfig(PreTrainedConfig):
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         bos_token_id: int | None = 11,
         eos_token_id: int | None = 11,
+        pad_token_id: int | None = None,
         ffn_hidden_size: int | None = None,
         activation: str | None = "gelu",
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -145,6 +151,7 @@ class FalconConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
         self.alibi = alibi
         self.new_decoder_architecture = new_decoder_architecture
@@ -154,6 +161,7 @@ class FalconConfig(PreTrainedConfig):
         self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
         self.max_position_embeddings = max_position_embeddings
         self.activation = activation
+        self.tie_word_embeddings = tie_word_embeddings
         if ffn_hidden_size is None:
             self.ffn_hidden_size = hidden_size * 4
         else:
@@ -161,7 +169,7 @@ class FalconConfig(PreTrainedConfig):
 
         self.rope_parameters = rope_parameters
 
-        super().__init__(
+        super().__init__(**kwargs)
 
     @property
     def head_dim(self):
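The FalconConfig hunks above follow the convention applied across this release: special-token ids and the embedding-tying flag become plain config attributes, and only the remaining kwargs are forwarded to the base `__init__`. A minimal stand-alone sketch of the pattern (`BaseConfig` and `MyConfig` are hypothetical stand-ins, not transformers classes):

```python
class BaseConfig:
    def __init__(self, **kwargs):
        # Stand-in for PreTrainedConfig: absorbs whatever is left over.
        for key, value in kwargs.items():
            setattr(self, key, value)

class MyConfig(BaseConfig):
    def __init__(self, pad_token_id=None, tie_word_embeddings=True, **kwargs):
        # Token ids and tying are set directly instead of routed through super().
        self.pad_token_id = pad_token_id
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(**kwargs)  # only the leftover kwargs reach the base class

cfg = MyConfig(pad_token_id=0)
print(cfg.pad_token_id, cfg.tie_word_embeddings)  # 0 True
```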
@@ -47,7 +47,7 @@ from ...utils import (
     auto_docstring,
     logging,
 )
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from .configuration_falcon import FalconConfig
 
 
@@ -511,12 +511,7 @@ class FalconFlashAttention2(FalconAttention):
         device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
-
-                target_dtype = (
-                    torch.get_autocast_dtype(device_type)
-                    if hasattr(torch, "get_autocast_dtype")
-                    else torch.get_autocast_gpu_dtype()
-                )
+                target_dtype = torch.get_autocast_dtype(device_type)
             # Handle the case where the model is quantized
             elif hasattr(self.config, "_is_quantized"):
                 target_dtype = self.config.dtype
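The simplification above assumes `torch.get_autocast_dtype` always exists, which holds for PyTorch >= 2.4 (the dropped `hasattr` fallback targeted older releases). A quick sketch of the call under that assumption:

```python
import torch

# Under an active autocast region, get_autocast_dtype reports the target dtype
# for the given device type (available since PyTorch 2.4).
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    if torch.is_autocast_enabled("cpu"):
        print(torch.get_autocast_dtype("cpu"))  # torch.bfloat16
```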
@@ -859,7 +854,7 @@ class FalconModel(FalconPreTrainedModel):
         # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
         # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
 
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
             return None
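Behind `is_flash_attention_requested` the mask handling is unchanged: flash-attention paths receive the raw 2D padding mask only when it actually masks something, and `None` otherwise so the kernel can skip masking entirely. A self-contained sketch of that shortcut (the boolean flag stands in for the helper, which is not reproduced here):

```python
import torch

def shortcut_mask(attention_mask, flash_requested: bool):
    # Mirrors the branch above: only meaningful padding masks survive.
    if flash_requested:
        if attention_mask is not None and 0.0 in attention_mask:
            return attention_mask
        return None
    return attention_mask  # non-flash paths build a full causal mask later

print(shortcut_mask(torch.tensor([[1.0, 1.0, 0.0]]), True))  # kept: contains padding
print(shortcut_mask(torch.ones(1, 3), True))                 # None: nothing to mask
```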
@@ -101,6 +101,12 @@ class FalconH1Config(PreTrainedConfig):
             Whether to use RMSNorm before the gate in the Mamba block
         mamba_rms_norm (`bool`, *optional*, defaults to `False`):
             Whether to use RMSNorm instead of LayerNorm in the Mamba block
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj.bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj.bias`.
+        time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`):
+            Accepted range of time step values for clamping.
         projectors_bias (`bool`, *optional*, defaults to `False`):
             Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block
         rope_parameters (`float`, *optional*):
@@ -160,6 +166,9 @@ class FalconH1Config(PreTrainedConfig):
         mamba_proj_bias: bool | None = False,
         mamba_norm_before_gate: bool | None = True,
         mamba_rms_norm: bool | None = False,
+        time_step_min: float | None = 0.001,
+        time_step_max: float | None = 0.1,
+        time_step_limit: tuple[float, float] | None = (0.0, float("inf")),
         projectors_bias: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         lm_head_multiplier: float | None = 1.0,
@@ -220,6 +229,9 @@ class FalconH1Config(PreTrainedConfig):
 
         self.mamba_norm_before_gate = mamba_norm_before_gate
         self.mamba_rms_norm = mamba_rms_norm
+        self.time_step_min = time_step_min
+        self.time_step_max = time_step_max
+        self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None
 
         self.lm_head_multiplier = lm_head_multiplier
         self.embedding_multiplier = embedding_multiplier
@@ -259,15 +271,12 @@ class FalconH1Config(PreTrainedConfig):
         else:
             self.ssm_out_multiplier = 1.0
 
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.rope_parameters = rope_parameters
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     @property
     def layers_block_type(self):
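The three new FalconH1 knobs expose values that were previously hardcoded in the mixer (see the `FalconH1Mixer` hunk further down). A rough sketch of what they govern in a Mamba-style block, with illustrative shapes rather than FalconH1's exact code: `dt` biases are sampled log-uniformly between `time_step_min` and `time_step_max`, and the discretized step is clamped into `time_step_limit` at runtime.

```python
import math

import torch

time_step_min, time_step_max = 0.001, 0.1
time_step_limit = (0.0, float("inf"))

# Log-uniform initialization in [time_step_min, time_step_max].
dt = torch.exp(
    torch.rand(8) * (math.log(time_step_max) - math.log(time_step_min)) + math.log(time_step_min)
)
dt = dt.clamp(*time_step_limit)  # runtime clamp into the accepted range
assert time_step_min <= dt.min() and dt.max() <= time_step_max
```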
@@ -44,7 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
 from .configuration_falcon_h1 import FalconH1Config
 
@@ -411,9 +411,9 @@ class FalconH1Attention(nn.Module):
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
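`ALL_ATTENTION_FUNCTIONS.get_interface(name, default)`, used above, folds the old eager if/else into a registry lookup with a fallback. A hypothetical stand-in registry showing the shape of that dispatch, not the transformers implementation:

```python
from typing import Callable

def eager_attention_forward(*args, **kwargs):
    return "eager"

class AttentionRegistry(dict):
    def get_interface(self, name: str, default: Callable) -> Callable:
        # Fall back to the default (eager) implementation for unregistered names.
        return self.get(name, default)

registry = AttentionRegistry(sdpa=lambda *a, **k: "sdpa")
print(registry.get_interface("sdpa", eager_attention_forward)())   # sdpa
print(registry.get_interface("eager", eager_attention_forward)())  # eager
```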
@@ -567,10 +567,9 @@ class FalconH1Mixer(nn.Module):
         self.head_dim = config.mamba_d_head
         self.chunk_size = config.mamba_chunk_size
 
-
-        self.time_step_limit = (0.0, float("inf"))
-        self.time_step_min = 0.001
-        self.time_step_max = 0.1
+        self.time_step_limit = config.time_step_limit
+        self.time_step_min = config.time_step_min
+        self.time_step_max = config.time_step_max
 
         self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
         self.conv1d = nn.Conv1d(
@@ -1038,7 +1037,7 @@ class FalconH1Mixer(nn.Module):
         cache_position: torch.LongTensor | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
-        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type and not is_torchdynamo_compiling():
             return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
         dtype = hidden_states.dtype
         if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
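The added `not is_torchdynamo_compiling()` guard keeps the hand-written CUDA kernels out of graphs captured by `torch.compile`, so tracing falls through to the pure-PyTorch path. A tiny sketch of the idea, assuming a recent PyTorch where `torch.compiler.is_compiling()` is available (transformers' `is_torchdynamo_compiling` wraps this kind of check):

```python
import torch

def forward(fast_path_available: bool) -> str:
    # Custom kernels are opaque to dynamo; skip them while a graph is being traced.
    if fast_path_available and not torch.compiler.is_compiling():
        return "cuda_kernels_forward"
    return "torch_forward"

print(forward(True))  # "cuda_kernels_forward" in eager mode, "torch_forward" under compile
```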
@@ -1389,7 +1388,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
         past_key_values: FalconHybridMambaAttentionDynamicCache,
         output_attentions: bool,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
             return None
@@ -1501,7 +1500,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
 @auto_docstring
 class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
     def __init__(self, config):
@@ -1597,22 +1596,7 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
     ):
         # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
 
-
-
-        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-        # Exception 1: when passing input_embeds, input_ids may be missing entries
-        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
-        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-        # (we can't check exception 3 while compiling)
-        if not empty_past_kv:
-            if (
-                inputs_embeds is not None  # Exception 1
-                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
-            ):
-                input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
-                input_ids = input_ids[:, cache_position]
-        else:
+        if past_key_values is None:
             past_key_values = FalconHybridMambaAttentionDynamicCache(
                 self.config,
                 input_ids.shape[0],
@@ -1622,35 +1606,19 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
             ],
         )
 
-
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if not empty_past_kv:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        if inputs_embeds is not None and empty_past_kv:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "logits_to_keep": self.config.num_logits_to_keep,
-                "cache_position": cache_position,
-            }
+        kwargs["logits_to_keep"] = self.config.num_logits_to_keep
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
         )
 
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
-
         return model_inputs
 
 
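The rewritten override stops re-implementing GenerationMixin's token slicing and keeps only what is model-specific: creating the hybrid cache and pinning `logits_to_keep`. A schematic, self-contained sketch of the delegation (class names and the base signature are illustrative, not the real transformers API):

```python
class _Base:
    # Stand-in for GenerationMixin.prepare_inputs_for_generation.
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}

class _HybridModel(_Base):
    num_logits_to_keep = 1

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        if past_key_values is None:
            past_key_values = {"cache": "hybrid"}  # stand-in for the Mamba/attention cache
        kwargs["logits_to_keep"] = self.num_logits_to_keep
        return super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, **kwargs
        )

inputs = _HybridModel().prepare_inputs_for_generation([1, 2, 3])
print(inputs["logits_to_keep"])  # 1
```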