transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
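A file-level comparison like the listing below can be approximated locally from the two wheels. The sketch that follows is illustrative only: the wheel paths `OLD_WHEEL` and `NEW_WHEEL` are assumed local files (e.g. fetched with `pip download transformers==5.1.0 --no-deps`), it only inspects `.py` members, and the added/removed counts come from Python's `difflib`, so they will approximate rather than exactly reproduce the registry's own diff.

```python
# Minimal sketch: per-file added/removed line counts between two wheels.
# OLD_WHEEL / NEW_WHEEL are assumed local paths, not part of the listing above.
import difflib
import zipfile

OLD_WHEEL = "transformers-5.0.0rc3-py3-none-any.whl"
NEW_WHEEL = "transformers-5.1.0-py3-none-any.whl"


def py_members(path: str) -> dict[str, list[str]]:
    """Return {member name: list of source lines} for every .py file in the wheel."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", "replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }


old, new = py_members(OLD_WHEEL), py_members(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    added = removed = 0
    for line in difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm=""):
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{name} +{added} -{removed}")
```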
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -25,11 +25,13 @@ from ...configuration_utils import PreTrainedConfig
 from ...feature_extraction_utils import BatchFeature
 from ...generation import GenerationMixin
 from ...image_utils import ImageInput
+from ...modeling_outputs import BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import ImagesKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import TransformersKwargs, is_torch_available, logging
-from
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
+from ...utils.generic import check_model_inputs
+from ..chameleon.modeling_chameleon import ChameleonVQVAE, ChameleonVQVAEModelOutput, ChameleonVQVAEVectorQuantizer
 from ..glm4v.configuration_glm4v import Glm4vTextConfig, Glm4vVisionConfig
 from ..glm4v.modeling_glm4v import (
     Glm4vCausalLMOutputWithPast,
@@ -174,8 +176,8 @@ class GlmImageTextConfig(Glm4vTextConfig):

     Args:
         vocab_size (`int`, *optional*, defaults to 168064):
-            Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
-            `inputs_ids` passed when calling [`GlmImageModel`]
+            Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`GlmImageModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 13696):
@@ -193,7 +195,7 @@ class GlmImageTextConfig(Glm4vTextConfig):
             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -202,19 +204,21 @@ class GlmImageTextConfig(Glm4vTextConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 167841):
+            The id of the padding token.
         vision_vocab_size (`int`, *optional*, defaults to 16512):
-            Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
-            by the `inputs_ids` passed when calling [`GlmImageVisionModel`]
+            Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`GlmImageVisionModel`]
         attention_bias (`bool`, *optional*, defaults to `True`):
             Whether to add a bias to the queries, keys and values.
+        eos_token_id (`int`, *optional*, defaults to 16385):
+            The id of the end of sequence token.

     ```python
     >>> from transformers import GlmImageTextModel, GlmImageConfig
@@ -231,18 +235,23 @@ class GlmImageTextConfig(Glm4vTextConfig):

     def __init__(
         self,
-        vocab_size: int
-
-
-
+        vocab_size: int = 168064,
+        max_position_embeddings: int = 131072,
+        vision_vocab_size: int = 16512,
+        attention_bias: bool = True,
+        pad_token_id: int = 167841,
+        eos_token_id: int = 16385,
         **super_kwargs,
     ):
-        self.vocab_size = vocab_size
-        self.vision_vocab_size = vision_vocab_size
-        self.attention_bias = attention_bias
         super().__init__(
-
+            vocab_size=vocab_size,
+            max_position_embeddings=max_position_embeddings,
+            pad_token_id=pad_token_id,
+            **super_kwargs,
         )
+        self.vision_vocab_size = vision_vocab_size
+        self.attention_bias = attention_bias
+        self.eos_token_id = eos_token_id


 class GlmImageConfig(PreTrainedConfig):
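A minimal sketch of what the reworked `GlmImageTextConfig.__init__` above implies for users, assuming `GlmImageTextConfig` is exported at the top level as the class docstring's own example suggests; the printed values are simply the documented defaults, and any of them can still be overridden as usual:

```python
# Hedged sketch, not part of the diff: exercising the new GlmImageTextConfig defaults.
from transformers import GlmImageTextConfig  # assumes top-level export, per the docstring example

config = GlmImageTextConfig()
print(config.vocab_size, config.vision_vocab_size)  # 168064 16512
print(config.pad_token_id, config.eos_token_id)     # 167841 16385

# Defaults can be overridden per the usual config pattern.
small = GlmImageTextConfig(vocab_size=32000, max_position_embeddings=8192)
print(small.vocab_size, small.max_position_embeddings)  # 32000 8192
```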
@@ -268,6 +277,8 @@ class GlmImageConfig(PreTrainedConfig):
             The image start token index to encode the start of image.
         image_end_token_id (`int`, *optional*, defaults to 16385):
             The image end token index to encode the end of image.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.

     ```python
     >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig
@@ -298,6 +309,7 @@ class GlmImageConfig(PreTrainedConfig):
         image_token_id=167855,
         image_start_token_id=16384,
         image_end_token_id=16385,
+        tie_word_embeddings: bool | None = False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -321,6 +333,7 @@ class GlmImageConfig(PreTrainedConfig):
         self.text_config = text_config
         self.vision_config = vision_config
         self.vq_config = vq_config
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)


@@ -348,9 +361,9 @@ class GlmImageVisionAttention(Glm4vVisionAttention):
         key_states = key_states.transpose(0, 1).unsqueeze(0)
         value_states = value_states.transpose(0, 1).unsqueeze(0)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         if "flash" in self.config._attn_implementation:
             # Flash Attention: Use cu_seqlens for variable length attention
@@ -516,19 +529,29 @@ class GlmImageVQVAEVectorQuantizer(ChameleonVQVAEVectorQuantizer):
         return hidden_state_quant, loss, min_encoding_indices


+class GlmImageVQVAEModelOutput(ChameleonVQVAEModelOutput):
+    pass
+
+
 class GlmImageVQVAE(ChameleonVQVAE):
     _no_split_modules = [
         "GlmImageVQVAEVectorQuantizer",
     ]
+    _can_record_outputs = {}

     def __init__(self, config: GlmImageVQVAEConfig):
         super().__init__(config)
         del self.encoder

     def encode(self, hidden_states):
-
-
-        return
+        conv_hidden_states = self.quant_conv(hidden_states)
+        quantized_last_hidden_state, emb_loss, indices = self.quantize(conv_hidden_states)
+        return GlmImageVQVAEModelOutput(
+            last_hidden_state=hidden_states,
+            quantized_last_hidden_state=quantized_last_hidden_state,
+            image_tokens=indices,
+            embedding_loss=emb_loss,
+        )


 class GlmImageVisionModel(Glm4vVisionModel):
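The hunk above replaces `GlmImageVQVAE.encode`'s bare return with a structured `GlmImageVQVAEModelOutput`, so callers read named fields such as `image_tokens` instead of unpacking a tuple. A hedged, self-contained illustration of that pattern with a toy nearest-neighbour quantizer (all names below except the output field names are hypothetical):

```python
# Hedged illustration only: toy quantizer showing the structured-output pattern from the diff.
from dataclasses import dataclass
import torch


@dataclass
class ToyVQVAEOutput:
    last_hidden_state: torch.Tensor
    quantized_last_hidden_state: torch.Tensor
    image_tokens: torch.Tensor
    embedding_loss: torch.Tensor


def toy_encode(hidden_states: torch.Tensor, codebook: torch.Tensor) -> ToyVQVAEOutput:
    # hidden_states: (B, C, H, W); codebook: (num_codes, C)
    flat = hidden_states.permute(0, 2, 3, 1).reshape(-1, hidden_states.shape[1])
    distances = torch.cdist(flat, codebook)        # (B*H*W, num_codes)
    indices = distances.argmin(dim=-1)             # discrete "image tokens"
    quantized = codebook[indices]
    loss = torch.mean((quantized.detach() - flat) ** 2)
    quantized = quantized.reshape(hidden_states.permute(0, 2, 3, 1).shape).permute(0, 3, 1, 2)
    return ToyVQVAEOutput(hidden_states, quantized, indices.reshape(hidden_states.shape[0], -1), loss)


out = toy_encode(torch.randn(1, 8, 4, 4), torch.randn(16, 8))
print(out.image_tokens.shape)  # torch.Size([1, 16])
```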
@@ -574,13 +597,16 @@ class GlmImageVisionModel(Glm4vVisionModel):
         pos_ids = torch.cat(pos_ids, dim=0)
         return pos_ids

-
-
-
-
-
-
-
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
+            Packed pixel values.
+        grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
+            The temporal, height and width of feature shape of each image.

         Returns:
             `torch.Tensor` of shape `(total_patches, hidden_size)`: Hidden states.
@@ -609,7 +635,8 @@ class GlmImageVisionModel(Glm4vVisionModel):
             hidden_states,
             cu_seqlens=cu_seqlens,
         )
-
+
+        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)


 class GlmImageTextModel(Glm4vTextModel):
@@ -625,6 +652,10 @@ class GlmImageModel(Glm4vModel):

         self.rope_deltas = None  # cache rope_deltas here

+        # Per-sample caches for batch processing
+        self._cached_decode_position_ids = None  # shape: [batch_size, 3, max_decode_len]
+        self._prefill_len = None  # prefill sequence length (same for all samples in batch)
+
         # Initialize weights and apply final processing
         self.post_init()

@@ -632,220 +663,169 @@ class GlmImageModel(Glm4vModel):
         self,
         input_ids: torch.LongTensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         attention_mask: torch.LongTensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
-        Calculate the 3D rope index for image generation task.
-
-        Explanation:
-            Each embedding sequence may contain image tokens (for generation) and text tokens,
-            or just text tokens.
-
-            Input format:
-            - Text-to-Image: [text tokens] + <|dit_token_16384|>
-            - Image-to-Image: <|dit_token_16384|> [image tokens] <|dit_token_16385|> + [text tokens] + <|dit_token_16384|>
-
-            For pure text embedding sequence, the rotary position embedding is the same across all 3 dimensions.
-            Examples:
-                input_ids: [T T T T T], here T is for text.
-                temporal position_ids: [0, 1, 2, 3, 4]
-                height position_ids: [0, 1, 2, 3, 4]
-                width position_ids: [0, 1, 2, 3, 4]
-
-            For sequences with image tokens, we use special markers to denote image regions:
-            - <|dit_token_16384|>: image start marker
-            - <|dit_token_16385|>: image end marker
-            - Image tokens between these markers use 2D spatial position encoding.
-
-            For image tokens:
-            - temporal: stays constant at (image_start_pos + 1)
-            - height: increments every w tokens, representing row position
-            - width: cycles from 0 to w-1, representing column position
-
-            After each image region, the next position jumps to: image_start_pos + 1 + max(h, w)
-            This ensures sufficient positional separation between images and subsequent tokens.
-
-            Examples:
-                === Case 1: Image-to-Image Generation ===
-
-                Source image with grid [1, 3, 2], followed by text, then generation.
-                input_ids: [<|dit_token_16384|> V V V V V V <|dit_token_16385|> T T T T <|dit_token_16384|>]
-                image_grid_thw: [[1, 3, 2], [1, 4, 4]]  # first is source, second is target
-
-                For source image (h=3, w=2, 6 tokens):
-                    Start marker at position 0
-                    Image tokens at temporal=1, height=[1,1,2,2,3,3], width=[1,2,1,2,1,2]
-                    End marker at position 4 (= 0 + 1 + max(3,2))
-
-                Text tokens and trailing start marker continue from position 5.
-
-                Full prefill position_ids:
-                    temporal: [0, 1,1,1,1,1,1, 4, 5,6,7,8, 9]
-                    height:   [0, 1,1,2,2,3,3, 4, 5,6,7,8, 9]
-                    width:    [0, 1,2,1,2,1,2, 4, 5,6,7,8, 9]
-
-                Decode stage: use image_grid_thw[-1] = [1, 4, 4] to build cached position_ids,
-                starting from gen_st_idx = 10.
-
-                === Case 2: Text-to-Image Generation (multi-resolution) ===
-
-                Pure text input with two image_grids for progressive generation.
-                input_ids: [hello<sop>3 3<eop><sop>3 2<eop><|dit_token_16384|>]
-                Assume "hello<sop>3 3<eop><sop>3 2<eop>" = 4 tokens (positions 0-3)
-                <|dit_token_16384|> at position 4
-                image_grid_thw: [[1, 3, 3], [1, 3, 2]]
-                - image_grid_thw[-1] = [1, 3, 2]: first generated image (smaller/draft)
-                - image_grid_thw[-2] = [1, 3, 3]: second generated image (larger/final)
-
-                Prefill position_ids (5 tokens: 4 text + 1 start marker):
-                    temporal: [0, 1, 2, 3, 4]
-                    height:   [0, 1, 2, 3, 4]
-                    width:    [0, 1, 2, 3, 4]
-
-                Decode stage builds position_ids in reverse order of image_grid_thw:
-
-                First: image_grid_thw[-1] = [1, 3, 2] (6 tokens), starting at position 5:
-                    temporal: [5, 5, 5, 5, 5, 5]
-                    height:   [5, 5, 6, 6, 7, 7]
-                    width:    [5, 6, 5, 6, 5, 6]
-                    next_pos = 5 + max(3, 2) = 8
-
-                Then: image_grid_thw[-2] = [1, 3, 3] (9 tokens), starting at position 8:
-                    temporal: [8, 8, 8, 8, 8, 8, 8, 8, 8]
-                    height:   [8, 8, 8, 9, 9, 9, 10, 10, 10]
-                    width:    [8, 9, 10, 8, 9, 10, 8, 9, 10]
-                    next_pos = 8 + max(3, 3) = 11
-
-                Finally: <|dit_token_16385|> end marker at position 11
-
-                Full sequence position_ids (prefill + decode):
-                    temporal: [0,1,2,3, 4, 5,5,5,5,5,5, 8,8,8,8,8,8,8,8,8, 11]
-                    height:   [0,1,2,3, 4, 5,5,6,6,7,7, 8,8,8,9,9,9,10,10,10, 11]
-                    width:    [0,1,2,3, 4, 5,6,5,6,5,6, 8,9,10,8,9,10,8,9,10, 11]
-
-                _cached_decode_position_ids shape: [3, 6 + 9 + 1] = [3, 16]
-                (includes all generated image tokens + end marker)
+        Calculate the 3D rope index for image generation task with full batch support.

         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary.
-
-
-
-
-
-
-                processed in reverse order (last grid first, second-to-last grid second, etc.)
+                Indices of input sequence tokens in the vocabulary.
+            image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image.
+                Images are packed across all samples in the batch.
+            images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Number of images (including target grids) for each sample in the batch.
+                Used to split image_grid_thw by sample.
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices.
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
+                Mask to avoid performing attention on padding token indices.

         Returns:
             position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`):
                 Position IDs for temporal, height, and width dimensions.
             mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
-                Position deltas for multi-modal rotary position embedding
+                Position deltas for multi-modal rotary position embedding.
         """
-
         batch_size, seq_len = input_ids.shape
         device = input_ids.device
         dtype = input_ids.dtype

         image_start_token_id = self.config.image_start_token_id
         image_end_token_id = self.config.image_end_token_id
-        num_complete_images = (input_ids == image_end_token_id).sum().item()

-        position_ids = torch.ones(
-
-
-
+        position_ids = torch.ones(3, batch_size, seq_len, dtype=dtype, device=device)
+        text_positions = torch.arange(seq_len, device=device)[None, :].repeat(3, 1)
+
+        # Split image_grid_thw by sample if images_per_sample is provided
+        if image_grid_thw is not None and images_per_sample is not None:
+            grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
+        elif image_grid_thw is not None:
+            # Fallback: assume all grids belong to first sample (batch_size=1)
+            grids_per_sample = [image_grid_thw] * batch_size
+        else:
+            grids_per_sample = [None] * batch_size
+
+        # Per-sample caches for decode stage
+        all_decode_position_ids = []
+
         for batch_idx in range(batch_size):
             curr_input_ids = input_ids[batch_idx]
-
-
+            curr_grids = grids_per_sample[batch_idx]
+
+            if attention_mask is not None and attention_mask.shape[1] == seq_len:
+                valid_mask = attention_mask[batch_idx] == 1
+                curr_input_ids_valid = curr_input_ids[valid_mask]
+            else:
+                # attention_mask may have different length during assisted decoding
+                curr_input_ids_valid = curr_input_ids
+                valid_mask = None
+
+            # Find image boundaries in this sample
+            image_end_positions = torch.where(curr_input_ids_valid == image_end_token_id)[0]
+            image_start_positions = torch.where(curr_input_ids_valid == image_start_token_id)[0] + 1
+            num_complete_images = len(image_end_positions)

-
-            image_start = torch.where(curr_input_ids == image_start_token_id)[0] + 1
-            current_pos = 0  # track the current position value
+            current_pos = 0
             prev_image_end = 0
             curr_position_ids = []
-            for start, end, grid in zip(image_start, image_end, image_grid_thw):
-                _, num_width_grid, num_height_grid = grid

-
+            # Process complete images (source images in image-to-image task)
+            for img_idx, (start, end) in enumerate(zip(image_start_positions, image_end_positions)):
+                if curr_grids is None or img_idx >= len(curr_grids):
+                    break
+                grid = curr_grids[img_idx]
+                # grid format is [temporal, height, width]
+                _, height, width = grid.tolist()
+
+                # Text tokens before this image
                 llm_pos_length = start - prev_image_end
-                llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(
-                    device=input_ids.device
-                )
+                llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(device=device)
                 current_pos += llm_position_ids.shape[-1]

-                #
-
-
-
-
-
-
-
-                position_temporal = torch.full(
-                    (image_seq_length,), current_pos, device=input_ids.device, dtype=torch.long
+                # Image tokens with 2D spatial encoding
+                # For an image with height H and width W:
+                # - position_width cycles [0, 1, ..., W-1] for each row, repeated H times
+                # - position_height stays constant per row, [0]*W, [1]*W, ..., [H-1]*W
+                image_seq_length = height * width
+                position_width = torch.arange(current_pos, current_pos + width, device=device).repeat(height)
+                position_height = torch.arange(current_pos, current_pos + height, device=device).repeat_interleave(
+                    width
                 )
+                position_temporal = torch.full((image_seq_length,), current_pos, device=device, dtype=torch.long)
                 vision_position_ids = torch.stack([position_temporal, position_height, position_width], dim=0)
-                current_pos += max(
+                current_pos += max(height, width)

                 prev_image_end = end
                 curr_position_ids.append(torch.cat([llm_position_ids, vision_position_ids], dim=-1))

-            #
-            end_position = len(
-            llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=
+            # Remaining text tokens (including the final image_start token for generation)
+            end_position = len(curr_input_ids_valid) - prev_image_end
+            llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=device)
             current_pos += llm_position_ids.shape[-1]
             curr_position_ids.append(llm_position_ids)
+
+            # Concatenate all position ids for this sample
             curr_position_ids = torch.cat(curr_position_ids, dim=-1)
-
-
+
+            # Store in the main position_ids tensor
+            if valid_mask is not None:
+                position_ids[:, batch_idx, valid_mask] = curr_position_ids
             else:
-                position_ids[:, batch_idx, :] = curr_position_ids
+                position_ids[:, batch_idx, :] = curr_position_ids
+
+            # Build decode position ids for this sample
+            if curr_grids is not None and len(curr_grids) > 0:
+                num_decode_grids = len(curr_grids) - num_complete_images
+                num_decode_grids = max(num_decode_grids, 0)
+                decode_pos = current_pos
+
+                decode_temporal_list = []
+                decode_height_list = []
+                decode_width_list = []

-
-
+                for i in range(1, num_decode_grids + 1):
+                    grid_idx = -i
+                    h = curr_grids[grid_idx, 1].item()
+                    w = curr_grids[grid_idx, 2].item()
+                    total_tokens = h * w
+
+                    h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
+                    w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
+
+                    decode_temporal_list.append(
+                        torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long)
+                    )
+                    decode_height_list.append(decode_pos + h_indices)
+                    decode_width_list.append(decode_pos + w_indices)
+                    decode_pos = decode_pos + max(h, w)
+
+                # End marker
+                decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+                decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+                decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+
+                sample_decode_pos_ids = torch.stack(
+                    [
+                        torch.cat(decode_temporal_list, dim=0),
+                        torch.cat(decode_height_list, dim=0),
+                        torch.cat(decode_width_list, dim=0),
+                    ],
+                    dim=0,
+                )
+                all_decode_position_ids.append(sample_decode_pos_ids)
+
+        # Store prefill length (same for all samples since input_ids is padded to same length)
         self._prefill_len = seq_len
-
-
-
-
-
-
-
-
-
-        for i in range(1, num_decode_grids + 1):
-            grid_idx = -i
-            h = image_grid_thw[grid_idx, 1].item()
-            w = image_grid_thw[grid_idx, 2].item()
-            total_tokens = h * w
-
-            h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
-            w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
-
-            decode_temporal_list.append(torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long))
-            decode_height_list.append(decode_pos + h_indices)
-            decode_width_list.append(decode_pos + w_indices)
-            decode_pos = decode_pos + max(h, w)
-
-        decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-        decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-        decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
-
-        self._cached_decode_position_ids = torch.stack(
-            [
-                torch.cat(decode_temporal_list, dim=0),
-                torch.cat(decode_height_list, dim=0),
-                torch.cat(decode_width_list, dim=0),
-            ],
-            dim=0,
-        )
+
+        # Pad decode position ids to same length and stack
+        if all_decode_position_ids:
+            max_decode_len = max(x.shape[1] for x in all_decode_position_ids)
+            padded_decode_pos_ids = [
+                F.pad(pos_ids, (0, max_decode_len - pos_ids.shape[1]), mode="replicate")
+                for pos_ids in all_decode_position_ids
+            ]
+            self._cached_decode_position_ids = torch.stack(padded_decode_pos_ids, dim=0)  # [batch, 3, max_decode_len]
         else:
            self._cached_decode_position_ids = None

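The removed docstring's worked example (a source grid of `[1, 3, 2]`) maps directly onto the `repeat` / `repeat_interleave` construction the new code uses for the 2D spatial block. A hedged, standalone sketch that reproduces those rows:

```python
# Hedged sketch, not part of the diff: 2D spatial position ids for one image block.
# height=3, width=2, block starting at current_pos=1, as in the removed docstring example.
import torch

current_pos, height, width = 1, 3, 2
position_width = torch.arange(current_pos, current_pos + width).repeat(height)
position_height = torch.arange(current_pos, current_pos + height).repeat_interleave(width)
position_temporal = torch.full((height * width,), current_pos, dtype=torch.long)

print(position_temporal.tolist())  # [1, 1, 1, 1, 1, 1]
print(position_height.tolist())    # [1, 1, 2, 2, 3, 3]
print(position_width.tolist())     # [1, 2, 1, 2, 1, 2]

# After the block, the running position jumps by max(height, width), so the end marker
# lands at 1 + max(3, 2) = 4, matching the example.
print(current_pos + max(height, width))  # 4
```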
@@ -880,13 +860,35 @@ class GlmImageModel(Glm4vModel):
             grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
             hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
             hs = hs.permute(0, 3, 1, 2).contiguous()
-
-            all_image_toks.append(
+            vqmodel_outputs: GlmImageVQVAEModelOutput = self.vqmodel.encode(hs)
+            all_image_toks.append(vqmodel_outputs.image_tokens)
         return torch.cat(all_image_toks, dim=0)

     def get_video_features(self):
         raise AttributeError("Not needed for GlmImage")

+    @can_return_tuple
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        """
+        pixel_values = pixel_values.type(self.visual.dtype)
+        vision_outputs = self.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs)
+        split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        image_embeds = torch.split(vision_outputs.last_hidden_state, split_sizes)
+        vision_outputs.pooler_output = image_embeds
+
+        return vision_outputs
+
     def get_placeholder_mask(
         self,
         input_ids: torch.LongTensor,
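The new `get_image_features` above splits the packed vision hidden states back into per-image chunks using `image_grid_thw.prod(-1) // spatial_merge_size**2`. A hedged sketch of that arithmetic (the merge factor of 2 is an assumption for illustration; the real value comes from the vision config):

```python
# Hedged sketch, not part of the diff: per-image split sizes for packed vision features.
import torch

spatial_merge_size = 2  # assumed merge factor for the sketch
image_grid_thw = torch.tensor([[1, 4, 6], [1, 8, 8]])  # two images, (t, h, w) in patch units

split_sizes = (image_grid_thw.prod(-1) // spatial_merge_size**2).tolist()
print(split_sizes)  # [6, 16]

hidden_size = 32
packed_hidden_states = torch.randn(sum(split_sizes), hidden_size)
per_image = torch.split(packed_hidden_states, split_sizes)
print([chunk.shape for chunk in per_image])  # [torch.Size([6, 32]), torch.Size([16, 32])]
```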
@@ -927,23 +929,63 @@ class GlmImageModel(Glm4vModel):
         inputs_embeds: torch.FloatTensor | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple | GlmImageModelOutputWithPast:
         r"""
-        image_grid_thw (`torch.LongTensor` of shape `(
+        image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
+            Images are packed across all samples in the batch.
+        images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Number of images (including target grids) for each sample in the batch.
         rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
             The rope index difference between sequence length and multimodal rope.
         """
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

+        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+
         if pixel_values is not None:
-
-
-
+            # Process source images (image-to-image mode)
+            # Source images are identified by counting image_end_token_id in input_ids
+            # Note: We must exclude padding tokens since pad_token_id == image_end_token_id
+            if images_per_sample is not None:
+                grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
+                # Create mask for non-padding tokens (attention_mask=1 means non-padding)
+                # Handle 4D attention mask (from static cache) by extracting diagonal
+                if attention_mask is not None and attention_mask.ndim == 4:
+                    non_pad_mask = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
+                    if non_pad_mask.dtype.is_floating_point:
+                        non_pad_mask = non_pad_mask / torch.finfo(non_pad_mask.dtype).min
+                    non_pad_mask = (1.0 - non_pad_mask).int()
+                    # Only keep columns matching input_ids length
+                    non_pad_mask = non_pad_mask[:, -input_ids.shape[1] :]
+                else:
+                    non_pad_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids)
+
+                source_grids_list = []
+                for sample_idx in range(batch_size):
+                    is_image_end = input_ids[sample_idx] == self.config.image_end_token_id
+                    is_non_pad = non_pad_mask[sample_idx] == 1
+                    num_source = (is_image_end & is_non_pad).sum().item()
+                    if num_source > 0:
+                        source_grids_list.append(grids_per_sample[sample_idx][:num_source])
+                if len(source_grids_list) == 0:
+                    raise ValueError(
+                        "pixel_values provided but no source images found in input_ids. "
+                        "Ensure input_ids contains image_end_token_id for each source image."
+                    )
+                source_grids = torch.cat(source_grids_list, dim=0)
+            else:
+                # Fallback for batch_size=1: all but last grid are source images
+                source_grids = image_grid_thw[:-1]
+
+            image_features = self.get_image_features(pixel_values, source_grids, return_dict=True)
+            image_embeds = torch.cat(image_features.pooler_output, dim=0)
+            image_ids = self.get_image_tokens(image_embeds, source_grids)
             image_ids = image_ids.view(-1).to(input_ids.device)
             special_image_mask = self.get_placeholder_mask(input_ids, image_ids)
             input_ids = input_ids.masked_scatter(special_image_mask, image_ids)
@@ -961,8 +1003,6 @@ class GlmImageModel(Glm4vModel):
             attention_mask_2d = (1.0 - attention_mask_2d).int()

             # Calculate RoPE index once per generation in the pre-fill stage only.
-            # It is safe to assume that `length!=1` means we're in pre-fill because the
-            # model is used only by DiT pipeline without assisted decoding, etc. techniques
             is_prefill_stage = (input_ids is not None and input_ids.shape[1] != 1) or (
                 inputs_embeds is not None and inputs_embeds.shape[1] != 1
             )
@@ -970,17 +1010,27 @@ class GlmImageModel(Glm4vModel):
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
+                    images_per_sample=images_per_sample,
                     attention_mask=attention_mask_2d,
                 )
                 self.rope_deltas = rope_deltas
             # then use the prev pre-calculated rope-deltas to get the correct position ids
             else:
                 batch_size, seq_length, _ = inputs_embeds.shape
-                #
-
-
-
-
+                # Per-sample decode position lookup
+                # _cached_decode_position_ids shape: [batch_size, 3, max_decode_len]
+                if self._cached_decode_position_ids is not None:
+                    step = cache_position[0].item() - self._prefill_len
+                    # Get position ids for all samples at once, then transpose to [3, batch_size, seq_length]
+                    position_ids = self._cached_decode_position_ids[:, :, step : step + seq_length].permute(1, 0, 2)
+                else:
+                    # Fallback for text-to-image or cases without cached decode positions
+                    # Use simple incremental positions
+                    start_pos = cache_position[0].item()
+                    position_ids = torch.arange(
+                        start_pos, start_pos + seq_length, device=inputs_embeds.device, dtype=torch.long
+                    )
+                    position_ids = position_ids.unsqueeze(0).repeat(3, batch_size, 1)

         outputs = self.language_model(
             input_ids=None,
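During decode, the hunk above slices position ids out of the per-sample cache using the offset of `cache_position` past the prefill length. A hedged sketch of that lookup with a dummy cache (shapes only; the values are placeholders):

```python
# Hedged sketch, not part of the diff: decode-stage slice of the cached position ids.
import torch

batch_size, max_decode_len, prefill_len, seq_length = 2, 10, 7, 1
cached = torch.arange(batch_size * 3 * max_decode_len).reshape(batch_size, 3, max_decode_len)

cache_position = torch.tensor([9])              # absolute position of the token being decoded
step = cache_position[0].item() - prefill_len   # two tokens already generated past the prefill
position_ids = cached[:, :, step : step + seq_length].permute(1, 0, 2)
print(position_ids.shape)  # torch.Size([3, 2, 1]) -> [3, batch_size, seq_length]
```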
@@ -1021,8 +1071,20 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         # Initialize weights and apply final processing
         self.post_init()

-
-
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        """
+        return self.model.get_image_features(pixel_values, image_grid_thw, **kwargs)

     def get_image_tokens(self, hidden_states: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
         return self.model.get_image_tokens(hidden_states, image_grid_thw)
@@ -1037,6 +1099,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         labels: torch.LongTensor | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
+        images_per_sample: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
@@ -1046,14 +1109,18 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        image_grid_thw (`torch.LongTensor` of shape `(
+        image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
+            Images are packed across all samples in the batch.
+        images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Number of images (including target grids) for each sample in the batch.

         Example:

         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, GlmImageForConditionalGeneration

         >>> model = GlmImageForConditionalGeneration.from_pretrained("zai-org/GLM-Image")
@@ -1069,7 +1136,8 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         ...     },
         ... ]
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
@@ -1083,6 +1151,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
             input_ids=input_ids,
             pixel_values=pixel_values,
             image_grid_thw=image_grid_thw,
+            images_per_sample=images_per_sample,
             position_ids=position_ids,
             attention_mask=attention_mask,
             past_key_values=past_key_values,
@@ -1121,6 +1190,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         use_cache=True,
         pixel_values=None,
         image_grid_thw=None,
+        images_per_sample=None,
         is_first_iteration=False,
         **kwargs,
     ):
@@ -1139,6 +1209,7 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         )

         model_inputs["position_ids"] = None
+        model_inputs["images_per_sample"] = images_per_sample

         if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
@@ -1175,11 +1246,42 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)
         if expand_size == 1:
             return input_ids, model_kwargs

-        visual_keys = ["pixel_values", "image_grid_thw"]
+        visual_keys = ["pixel_values", "image_grid_thw", "images_per_sample"]

         def _expand_dict_for_generation_visual(dict_to_expand):
             image_grid_thw = model_kwargs.get("image_grid_thw", None)
-
+            if image_grid_thw is None:
+                return dict_to_expand
+
+            images_per_sample = model_kwargs.get("images_per_sample", None)
+
+            # Use images_per_sample if available
+            if images_per_sample is not None:
+                image_nums = images_per_sample.tolist()
+            elif input_ids is not None:
+                # Try to infer from image_grid_thw / batch_size
+                batch_size = input_ids.shape[0]
+                total_grids = image_grid_thw.shape[0]
+                if total_grids % batch_size == 0:
+                    grids_per_sample = total_grids // batch_size
+                    image_nums = [grids_per_sample] * batch_size
+                else:
+                    # Cannot evenly distribute grids - fall back to simple repeat_interleave
+                    # This handles test cases where image_grid_thw has (batch_size + 1) rows
+                    dict_to_expand["image_grid_thw"] = image_grid_thw.repeat_interleave(expand_size, dim=0)
+                    if dict_to_expand.get("pixel_values") is not None:
+                        dict_to_expand["pixel_values"] = dict_to_expand["pixel_values"].repeat_interleave(
+                            expand_size, dim=0
+                        )
+                    return dict_to_expand
+            else:
+                image_nums = self._get_image_nums(input_ids).tolist()
+
+            # Get source image counts per sample from image_end_token_id count
+            source_image_nums = [
+                (input_ids[batch_idx] == self.config.image_end_token_id).sum().item()
+                for batch_idx in range(len(image_nums))
+            ]

             def _repeat_interleave_samples(x, lengths, repeat_times):
                 samples = torch.split(x, lengths)
@@ -1189,21 +1291,31 @@ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin)

             for key in dict_to_expand:
                 if key == "pixel_values":
-                    #
-
-
-
-
-
-
+                    # Split images into samples based on source image counts
+                    if sum(source_image_nums) > 0:
+                        # Split grids by sample to compute pixel counts
+                        grids_per_sample = torch.split(image_grid_thw, image_nums)
+                        lengths = []
+                        for batch_idx, sample_grids in enumerate(grids_per_sample):
+                            num_source = source_image_nums[batch_idx]
+                            if num_source > 0:
+                                source_grids = sample_grids[:num_source]
+                                lengths.append(torch.prod(source_grids, dim=1).sum().item())
+                            else:
+                                lengths.append(0)
+
+                        dict_to_expand[key] = _repeat_interleave_samples(
+                            dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                        )
                 elif key == "image_grid_thw":
-                    #
-                    lengths = list(image_nums)
-                    last_image = dict_to_expand[key][:-1]
+                    # Expand all grids (source + target) per sample
                     dict_to_expand[key] = _repeat_interleave_samples(
-                        dict_to_expand[key]
+                        dict_to_expand[key], lengths=image_nums, repeat_times=expand_size
                     )
-
+                elif key == "images_per_sample":
+                    # Simply repeat the counts
+                    if dict_to_expand.get(key) is not None:
+                        dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
             return dict_to_expand

         def _expand_dict_for_generation(dict_to_expand):
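The expansion logic above relies on a split-then-repeat helper so that beam-style expansion keeps each sample's packed images adjacent. A hedged sketch of that pattern (the helper's body is not fully visible in the hunk, so the exact repeat layout below is illustrative, not authoritative):

```python
# Hedged sketch, not part of the diff: split a packed tensor per sample and repeat each chunk.
import torch


def repeat_interleave_samples(x: torch.Tensor, lengths: list[int], repeat_times: int) -> torch.Tensor:
    samples = torch.split(x, lengths)
    return torch.cat(
        [sample.repeat(repeat_times, *([1] * (sample.dim() - 1))) for sample in samples], dim=0
    )


image_grid_thw = torch.tensor([[1, 2, 2], [1, 4, 4], [1, 3, 3]])  # sample 0 owns 2 grids, sample 1 owns 1
expanded = repeat_interleave_samples(image_grid_thw, lengths=[2, 1], repeat_times=2)
print(expanded.tolist())
# [[1, 2, 2], [1, 4, 4], [1, 2, 2], [1, 4, 4], [1, 3, 3], [1, 3, 3]]
```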
@@ -1274,11 +1386,11 @@ def smart_resize(


 class GlmImageImageProcessor(Qwen2VLImageProcessor):
-
+    model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]


 class GlmImageImageProcessorFast(Qwen2VLImageProcessorFast):
-
+    model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]


 class GlmImageImagesKwargs(ImagesKwargs, total=False):
@@ -1321,6 +1433,8 @@ class GlmImageProcessor(ProcessorMixin):
             in a chat into a tokenizable string.
     """

+    model_input_names = ["input_ids", "attention_mask", "pixel_values", "image_grid_thw", "images_per_sample"]
+
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
         self.image_token = tokenizer.image_token
         self.grid_bos_token = tokenizer.grid_bos_token
@@ -1368,6 +1482,7 @@ class GlmImageProcessor(ProcessorMixin):
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
+
         target_h = output_kwargs["images_kwargs"].pop("target_h", None)
         target_w = output_kwargs["images_kwargs"].pop("target_w", None)
         is_text_to_image = images is None
@@ -1379,16 +1494,27 @@ class GlmImageProcessor(ProcessorMixin):
         image_inputs = {}
         image_grid_thw = None

+        # Handle text=None case (image-only processing)
+        if text is None:
+            if images is None:
+                raise ValueError("You must provide at least one of `text` or `images`.")
+            return image_inputs
+
         if not isinstance(text, list):
             text = [text]

-
-            raise ValueError("The model does not support batch size > 1")
-
+        batch_size = len(text)
         text = text.copy()  # below lines change text in-place
+
+        # Count images per sample by counting image tokens in each text
+        images_per_sample = []
+        for i in range(batch_size):
+            images_per_sample.append(text[i].count(self.image_token))
+
+        # Replace image tokens with the correct number of placeholder tokens
         if not is_text_to_image:
             index = 0
-            for i in range(
+            for i in range(batch_size):
                 while self.image_token in text[i]:
                     grid = image_grid_thw[index]
                     num_image_tokens = int(grid[1] * grid[2])
@@ -1396,20 +1522,50 @@ class GlmImageProcessor(ProcessorMixin):
                     index += 1
                 text[i] = text[i].replace("<|placeholder|>", self.image_token)

-
-
-        )
-
-
-
-
-
-
+        # Build prompt with target shape and combine grids in a single loop
+        # Format: [sample0_source_grids..., sample0_target_grids, sample1_source_grids..., sample1_target_grids, ...]
+        # Note: In i2i mode, batches are homogeneous (same number of source images per sample)
+        num_source_images = images_per_sample[0] if images_per_sample else 0
+
+        # Validate homogeneity for i2i mode
+        if not is_text_to_image and images_per_sample and len(set(images_per_sample)) != 1:
+            raise ValueError(
+                f"In image-to-image mode, all samples must have the same number of source images. "
+                f"Got different counts: {images_per_sample}"
+            )
+
+        all_grids = []
+        for i in range(batch_size):
+            text[i], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
+                text[i], height=target_h, width=target_w, is_text_to_image=is_text_to_image
+            )
+            # Add source grids for this sample (i2i mode only)
+            if not is_text_to_image and num_source_images > 0:
+                start_idx = i * num_source_images
+                all_grids.append(image_grid_thw[start_idx : start_idx + num_source_images])
+            # Add target grid for this sample
+            all_grids.append(
+                self._build_target_image_grid_thw(
+                    token_h=token_h,
+                    token_w=token_w,
+                    prev_token_h=prev_h,
+                    prev_token_w=prev_w,
+                    is_text_to_image=is_text_to_image,
+                )
+            )
+        image_inputs["image_grid_thw"] = torch.cat(all_grids, dim=0)
+
+        # Store images_per_sample for later use (add target images count)
+        # Each sample will have: source_images + target_images (typically 2 for t2i, 1 for i2i)
+        num_target_grids = 2 if is_text_to_image else 1
+        image_inputs["images_per_sample"] = torch.tensor(
+            [num_source_images + num_target_grids] * batch_size, dtype=torch.long
         )

         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])

         if return_mm_token_type_ids:
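The processor hunks above derive `images_per_sample` by counting image placeholders in each prompt and adding the number of target grids (two for text-to-image, one for image-to-image), with all samples required to carry the same number of source images. A hedged, standalone sketch of that bookkeeping (the `"<|image|>"` placeholder string is an assumption for illustration only):

```python
# Hedged sketch, not part of the diff: per-sample image counting as described in the hunk above.
import torch

image_token = "<|image|>"
texts = [
    f"Make {image_token} look like autumn.",
    f"Make {image_token} look like winter.",
]

source_counts = [t.count(image_token) for t in texts]   # [1, 1] - must be homogeneous in i2i mode
is_text_to_image = all(count == 0 for count in source_counts)
num_target_grids = 2 if is_text_to_image else 1         # 2 target grids for t2i, 1 for i2i

images_per_sample = torch.tensor(
    [count + num_target_grids for count in source_counts], dtype=torch.long
)
print(images_per_sample.tolist())  # [2, 2]
```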
@@ -1448,9 +1604,10 @@ class GlmImageProcessor(ProcessorMixin):
         token_w: int,
         prev_token_h: int,
         prev_token_w: int,
-
+        is_text_to_image: bool = True,
     ):
-        if
+        if is_text_to_image:
+            # Text-to-image: 2 target grids (large + small preview)
             return torch.tensor(
                 [
                     [1, token_h, token_w],
@@ -1458,8 +1615,11 @@ class GlmImageProcessor(ProcessorMixin):
                 ],
             )
         else:
-
-
+            # Image-to-image: 1 target grid only
+            return torch.tensor(
+                [
+                    [1, token_h, token_w],
+                ],
             )
