transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -61,8 +61,6 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
@@ -90,8 +88,15 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
             \--k dense layers--/
         norm_topk_prob (`bool`, *optional*, defaults to `True`):
             Whether to normalize the topk probabilities.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.0001):
             The aux loss factor for the loss.
+
     ```python
     >>> from transformers import Glm4vMoeTextModel, Glm4vMoeConfig

@@ -140,7 +145,6 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         initializer_range: float | None = 0.02,
         rms_norm_eps: int | None = 1e-5,
         use_cache: bool | None = True,
-        tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_bias: bool | None = True,
         attention_dropout: float | None = 0.0,
@@ -153,9 +157,15 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         topk_group: int | None = 1,
         first_k_dense_replace: int | None = 1,
         norm_topk_prob: bool | None = True,
+        pad_token_id: int | None = None,
+        eos_token_id: int | None = None,
+        bos_token_id: int | None = None,
         router_aux_loss_coef: float | None = 0.0001,
         **kwargs,
     ):
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -184,9 +194,7 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
-        )
+        super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)


 class Glm4vMoeVisionConfig(PreTrainedConfig):
@@ -310,6 +318,8 @@ class Glm4vMoeConfig(PreTrainedConfig):
             The video start token index to encode the start of video.
         video_end_token_id (`int`, *optional*, defaults to 151342):
             The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.

     ```python
     >>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
@@ -338,6 +348,7 @@ class Glm4vMoeConfig(PreTrainedConfig):
         image_end_token_id=151340,
         video_start_token_id=151341,
         video_end_token_id=151342,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -356,6 +367,7 @@ class Glm4vMoeConfig(PreTrainedConfig):
         self.video_end_token_id = video_end_token_id
         self.image_start_token_id = image_start_token_id
         self.image_end_token_id = image_end_token_id
+        self.tie_word_embeddings = tie_word_embeddings

         super().__init__(**kwargs)

@@ -35,7 +35,7 @@ from ...integrations import use_experts_implementation, use_kernel_forward_from_
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import ModelOutput, MoeModelOutputWithPast
+from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -45,8 +45,9 @@ from ...utils import (
     can_return_tuple,
     is_grouped_mm_available,
     is_torchdynamo_compiling,
+    torch_compilable_check,
 )
-from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
+from ...utils.generic import OutputRecorder, check_model_inputs, is_flash_attention_requested, maybe_autocast
 from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig


@@ -185,9 +186,9 @@ class Glm4vMoeTextAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -684,11 +685,11 @@ class Glm4vMoeVisionAttention(nn.Module):
         key_states = key_states.transpose(0, 1).unsqueeze(0)
         value_states = value_states.transpose(0, 1).unsqueeze(0)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

-        if
+        if is_flash_attention_requested(self.config):
             # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
@@ -766,6 +767,10 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
     config: Glm4vMoeVisionConfig
     input_modalities = ("image", "video")
     _no_split_modules = ["Glm4vMoeVisionBlock"]
+    _can_record_outputs = {
+        "hidden_states": Glm4vMoeVisionBlock,
+        "attentions": Glm4vMoeVisionAttention,
+    }

     def __init__(self, config) -> None:
         super().__init__(config)
@@ -824,13 +829,16 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-
-
-
-
-
-
-
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
+            The final hidden states of the model.
+        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+            The temporal, height and width of feature shape of each image in LLM.

         Returns:
             `torch.Tensor`: hidden_states.
@@ -864,6 +872,7 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
                 hidden_states,
                 cu_seqlens=cu_seqlens,
                 position_embeddings=position_embeddings,
+                **kwargs,
             )

         hidden_states = self.post_layernorm(hidden_states)
@@ -874,8 +883,12 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
         hidden_states = hidden_states.permute(0, 3, 1, 2)
         hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)

-
-
+        merged_hidden_states = self.merger(hidden_states)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=hidden_states,
+            pooler_output=merged_hidden_states,
+        )


 class Glm4vMoeTextRotaryEmbedding(nn.Module):
@@ -988,7 +1001,7 @@ class Glm4vMoeTextModel(Glm4vMoePreTrainedModel):
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> MoeModelOutputWithPast:
+    ) -> tuple | MoeModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -1302,17 +1315,19 @@ class Glm4vMoeModel(Glm4vMoePreTrainedModel):

         return position_ids, mrope_position_deltas

+    @can_return_tuple
+    @auto_docstring
     def get_video_features(
-        self,
-
-
-
-
-
-
-
-
-
+        self,
+        pixel_values_videos: torch.FloatTensor,
+        video_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input videos.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
         """
         pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
         # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
@@ -1321,26 +1336,36 @@ class Glm4vMoeModel(Glm4vMoePreTrainedModel):
             repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
             temp_frames_hw.append(repeated_row)
         flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
-
+        vision_outputs = self.visual(
+            pixel_values_videos, grid_thw=flattened_video_grid_thw, return_dict=True, **kwargs
+        )
         split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
-        video_embeds = torch.split(
-
+        video_embeds = torch.split(vision_outputs.pooler_output, split_sizes)
+        vision_outputs.pooler_output = video_embeds

-
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
+        return vision_outputs

-
-
-
-
-
+    @can_return_tuple
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         pixel_values = pixel_values.type(self.visual.dtype)
-
+        vision_outputs = self.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs)
         split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
-        image_embeds = torch.split(
-
+        image_embeds = torch.split(vision_outputs.pooler_output, split_sizes)
+        vision_outputs.pooler_output = image_embeds
+
+        return vision_outputs

     def get_placeholder_mask(
         self,
@@ -1369,18 +1394,19 @@ class Glm4vMoeModel(Glm4vMoePreTrainedModel):

         n_image_tokens = special_image_mask.sum()
         special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        if image_features is not None
-
-
+        if image_features is not None:
+            torch_compilable_check(
+                inputs_embeds[special_image_mask].numel() == image_features.numel(),
+                f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}",
             )

         n_video_tokens = special_video_mask.sum()
         special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
-        if video_features is not None
-
-
+        if video_features is not None:
+            torch_compilable_check(
+                inputs_embeds[special_video_mask].numel() == video_features.numel(),
+                f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}",
             )
-
         return special_image_mask, special_video_mask

     @auto_docstring
@@ -1415,13 +1441,13 @@ class Glm4vMoeModel(Glm4vMoePreTrainedModel):
             inputs_embeds = self.get_input_embeddings()(input_ids)

         if pixel_values is not None:
-            image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = self.get_image_features(pixel_values, image_grid_thw, return_dict=True).pooler_output
            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
             image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds)
             inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

         if pixel_values_videos is not None:
-            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw, return_dict=True).pooler_output
             video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
             _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds)
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
@@ -1592,13 +1618,37 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
     def set_input_embeddings(self, value):
         self.model.set_input_embeddings(value)

+    @auto_docstring
     def get_video_features(
-        self,
-
-
+        self,
+        pixel_values_videos: torch.FloatTensor,
+        video_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input videos.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        """
+        return self.model.get_video_features(
+            pixel_values_videos=pixel_values_videos, video_grid_thw=video_grid_thw, **kwargs
+        )

-
-
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        """
+        return self.model.get_image_features(pixel_values=pixel_values, image_grid_thw=image_grid_thw, **kwargs)

     @auto_docstring
     @check_model_inputs
@@ -1632,23 +1682,25 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)

         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration

-        >>> model = Glm4vMoeForConditionalGeneration.from_pretrained("
-        >>> processor = AutoProcessor.from_pretrained("
+        >>> model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
+        >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

         >>> messages = [
             {
                 "role": "user",
                 "content": [
-                    {"type": "image"},
+                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                     {"type": "text", "text": "What is shown in this image?"},
                 ],
             },
         ]
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
@@ -1738,7 +1790,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
             **kwargs,
         )

-        # GLM-
+        # GLM-V position_ids are prepared with rope_deltas in forward
         model_inputs["position_ids"] = None

         if not is_first_iteration and use_cache:
@@ -22,7 +22,7 @@ from ...configuration_utils import PreTrainedConfig
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import MoeModelOutputWithPast
-from ...modeling_rope_utils import RopeParameters
+from ...modeling_rope_utils import RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
@@ -55,7 +55,7 @@ from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
 logger = logging.get_logger(__name__)


-class Glm4vMoeTextConfig(Glm4MoeConfig
+class Glm4vMoeTextConfig(Glm4MoeConfig):
     r"""
     This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a
     GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a
@@ -95,8 +95,6 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
@@ -124,8 +122,15 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
             \--k dense layers--/
         norm_topk_prob (`bool`, *optional*, defaults to `True`):
             Whether to normalize the topk probabilities.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.0001):
             The aux loss factor for the loss.
+
     ```python
     >>> from transformers import Glm4vMoeTextModel, Glm4vMoeConfig

@@ -171,7 +176,6 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         initializer_range: float | None = 0.02,
         rms_norm_eps: int | None = 1e-5,
         use_cache: bool | None = True,
-        tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_bias: bool | None = True,
         attention_dropout: float | None = 0.0,
@@ -184,9 +188,15 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         topk_group: int | None = 1,
         first_k_dense_replace: int | None = 1,
         norm_topk_prob: bool | None = True,
+        pad_token_id: int | None = None,
+        eos_token_id: int | None = None,
+        bos_token_id: int | None = None,
         router_aux_loss_coef: float | None = 0.0001,
         **kwargs,
     ):
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -215,9 +225,7 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
-        PreTrainedConfig.__init__(
-            self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
-        )
+        PreTrainedConfig.__init__(self, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)


 class Glm4vMoeConfig(Glm4vConfig):
@@ -248,6 +256,8 @@ class Glm4vMoeConfig(Glm4vConfig):
             The video start token index to encode the start of video.
         video_end_token_id (`int`, *optional*, defaults to 151342):
             The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.

     ```python
     >>> from transformers import Glm4vMoeForConditionalGeneration, Glm4vMoeConfig
@@ -272,6 +282,7 @@ class Glm4vMoeConfig(Glm4vConfig):
         image_end_token_id=151340,
         video_start_token_id=151341,
         video_end_token_id=151342,
+        tie_word_embeddings=False,
         **kwargs,
     ):
         super().__init__()
@@ -310,9 +321,9 @@ class Glm4vMoeTextAttention(Glm4Attention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -404,7 +415,7 @@ class Glm4vMoeTextModel(Glm4vTextModel):
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> MoeModelOutputWithPast:
+    ) -> tuple | MoeModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -150,8 +150,8 @@ class GlmImageTextConfig(PreTrainedConfig):

     Args:
         vocab_size (`int`, *optional*, defaults to 168064):
-            Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
-            `inputs_ids` passed when calling [`GlmImageModel`]
+            Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`GlmImageModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 13696):
@@ -169,7 +169,7 @@ class GlmImageTextConfig(PreTrainedConfig):
             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -178,19 +178,21 @@ class GlmImageTextConfig(PreTrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 167841):
+            The id of the padding token.
         vision_vocab_size (`int`, *optional*, defaults to 16512):
-            Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
-            by the `inputs_ids` passed when calling [`GlmImageVisionModel`]
+            Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`GlmImageVisionModel`]
         attention_bias (`bool`, *optional*, defaults to `True`):
             Whether to add a bias to the queries, keys and values.
+        eos_token_id (`int`, *optional*, defaults to 16385):
+            The id of the end of sequence token.

     ```python
     >>> from transformers import GlmImageTextModel, GlmImageConfig
@@ -214,8 +216,8 @@ class GlmImageTextConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_up_proj": "
-        "layers.*.mlp.down_proj": "
+        "layers.*.mlp.gate_up_proj": "colwise_gather_output",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_split_input",  # input is replicated due to the `chunk` operation
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
@@ -225,27 +227,25 @@ class GlmImageTextConfig(PreTrainedConfig):

     def __init__(
         self,
-        vocab_size: int
+        vocab_size: int = 168064,
         hidden_size: int | None = 4096,
         intermediate_size: int | None = 13696,
         num_hidden_layers: int | None = 40,
         num_attention_heads: int | None = 32,
         num_key_value_heads: int | None = 2,
         hidden_act: str | None = "silu",
-        max_position_embeddings: int
+        max_position_embeddings: int = 131072,
         initializer_range: float | None = 0.02,
         rms_norm_eps: int | None = 1e-05,
         use_cache: bool | None = True,
-        tie_word_embeddings: bool | None = False,
         attention_dropout: float | None = 0.0,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
-
-
+        pad_token_id: int = 167841,
+        vision_vocab_size: int = 16512,
+        attention_bias: bool = True,
+        eos_token_id: int = 16385,
         **kwargs,
     ):
-        self.vocab_size = vocab_size
-        self.vision_vocab_size = vision_vocab_size
-        self.attention_bias = attention_bias
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -264,10 +264,12 @@ class GlmImageTextConfig(PreTrainedConfig):
         self.use_cache = use_cache
         self.attention_dropout = attention_dropout
         self.rope_parameters = rope_parameters
+        self.pad_token_id = pad_token_id

-        super().__init__(
-
-
+        super().__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)
+        self.vision_vocab_size = vision_vocab_size
+        self.attention_bias = attention_bias
+        self.eos_token_id = eos_token_id


 class GlmImageConfig(PreTrainedConfig):
@@ -293,6 +295,8 @@ class GlmImageConfig(PreTrainedConfig):
             The image start token index to encode the start of image.
         image_end_token_id (`int`, *optional*, defaults to 16385):
             The image end token index to encode the end of image.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.

     ```python
     >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig
@@ -323,6 +327,7 @@ class GlmImageConfig(PreTrainedConfig):
         image_token_id=167855,
         image_start_token_id=16384,
         image_end_token_id=16385,
+        tie_word_embeddings: bool | None = False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -346,6 +351,7 @@ class GlmImageConfig(PreTrainedConfig):
         self.text_config = text_config
         self.vision_config = vision_config
         self.vq_config = vq_config
+        self.tie_word_embeddings = tie_word_embeddings

         super().__init__(**kwargs)

@@ -144,7 +144,7 @@ class GlmImageImageProcessor(BaseImageProcessor):
             The merge size of the vision encoder to llm encoder.
     """

-    model_input_names = ["pixel_values", "image_grid_thw"]
+    model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]
     valid_kwargs = GlmImageImageProcessorKwargs

     def __init__(