transformers 5.0.0rc2-py3-none-any.whl → 5.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
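The per-file `+added -removed` counts listed below summarize line-level changes between the two wheels. As a rough illustration only (this is not the tooling the registry diff service uses), the sketch below shows one way to approximate similar counts locally with the Python standard library; the local wheel paths and the `pip download` commands in the comments are assumptions for the example, not part of this diff.

```python
import difflib
import zipfile

# Hypothetical local paths; the wheels could be fetched first with, e.g.:
#   pip download transformers==5.0.0rc2 --no-deps -d old/
#   pip download transformers==5.1.0 --no-deps -d new/
OLD_WHEEL = "old/transformers-5.0.0rc2-py3-none-any.whl"
NEW_WHEEL = "new/transformers-5.1.0-py3-none-any.whl"


def read_py_members(path):
    """Return {archive member name: list of source lines} for .py files in a wheel (a zip archive)."""
    members = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            if name.endswith(".py"):
                text = zf.read(name).decode("utf-8", errors="replace")
                members[name] = text.splitlines()
    return members


old_files = read_py_members(OLD_WHEEL)
new_files = read_py_members(NEW_WHEEL)

# Print per-file added/removed line counts, similar in spirit to the +X -Y columns below.
for name in sorted(set(old_files) | set(new_files)):
    diff = difflib.unified_diff(old_files.get(name, []), new_files.get(name, []), lineterm="")
    added = removed = 0
    for line in diff:
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{name} +{added} -{removed}")
```

Counts produced this way may differ slightly from the listing below, since the registry diff also covers renamed and non-Python files (see, for example, the `{pipelines/deprecated → models/lw_detr}` rename entry).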
- transformers/__init__.py +11 -37
- transformers/activations.py +2 -2
- transformers/audio_utils.py +32 -32
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +26 -126
- transformers/cli/chat.py +3 -3
- transformers/cli/serve.py +13 -10
- transformers/cli/transformers.py +2 -1
- transformers/configuration_utils.py +22 -92
- transformers/conversion_mapping.py +150 -26
- transformers/convert_slow_tokenizer.py +9 -12
- transformers/core_model_loading.py +217 -129
- transformers/data/processors/glue.py +0 -1
- transformers/data/processors/utils.py +0 -1
- transformers/data/processors/xnli.py +0 -1
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +10 -11
- transformers/distributed/configuration_utils.py +1 -2
- transformers/dynamic_module_utils.py +23 -23
- transformers/feature_extraction_sequence_utils.py +19 -23
- transformers/feature_extraction_utils.py +14 -14
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +2 -4
- transformers/generation/configuration_utils.py +54 -39
- transformers/generation/continuous_batching/__init__.py +0 -1
- transformers/generation/continuous_batching/cache.py +74 -44
- transformers/generation/continuous_batching/cache_manager.py +28 -28
- transformers/generation/continuous_batching/continuous_api.py +133 -414
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +77 -19
- transformers/generation/continuous_batching/scheduler.py +154 -104
- transformers/generation/logits_process.py +10 -133
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/streamers.py +0 -1
- transformers/generation/utils.py +91 -121
- transformers/generation/watermarking.py +2 -3
- transformers/hf_argparser.py +9 -13
- transformers/hyperparameter_search.py +1 -2
- transformers/image_processing_base.py +9 -9
- transformers/image_processing_utils.py +11 -15
- transformers/image_processing_utils_fast.py +70 -71
- transformers/image_transforms.py +73 -42
- transformers/image_utils.py +30 -37
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/awq.py +1 -3
- transformers/integrations/deepspeed.py +146 -4
- transformers/integrations/eetq.py +0 -1
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/fbgemm_fp8.py +1 -2
- transformers/integrations/finegrained_fp8.py +149 -13
- transformers/integrations/flash_attention.py +3 -8
- transformers/integrations/flex_attention.py +1 -1
- transformers/integrations/fp_quant.py +4 -6
- transformers/integrations/ggml.py +0 -1
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/integration_utils.py +2 -3
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +52 -40
- transformers/integrations/peft.py +488 -176
- transformers/integrations/quark.py +2 -4
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/integrations/torchao.py +4 -6
- transformers/loss/loss_lw_detr.py +356 -0
- transformers/loss/loss_utils.py +2 -0
- transformers/masking_utils.py +199 -59
- transformers/model_debugging_utils.py +4 -5
- transformers/modelcard.py +14 -192
- transformers/modeling_attn_mask_utils.py +19 -19
- transformers/modeling_flash_attention_utils.py +28 -29
- transformers/modeling_gguf_pytorch_utils.py +5 -5
- transformers/modeling_layers.py +21 -22
- transformers/modeling_outputs.py +242 -253
- transformers/modeling_rope_utils.py +32 -32
- transformers/modeling_utils.py +416 -438
- transformers/models/__init__.py +10 -0
- transformers/models/afmoe/configuration_afmoe.py +40 -33
- transformers/models/afmoe/modeling_afmoe.py +38 -41
- transformers/models/afmoe/modular_afmoe.py +23 -25
- transformers/models/aimv2/configuration_aimv2.py +2 -10
- transformers/models/aimv2/modeling_aimv2.py +46 -45
- transformers/models/aimv2/modular_aimv2.py +13 -19
- transformers/models/albert/configuration_albert.py +8 -2
- transformers/models/albert/modeling_albert.py +70 -72
- transformers/models/albert/tokenization_albert.py +1 -4
- transformers/models/align/configuration_align.py +8 -6
- transformers/models/align/modeling_align.py +83 -86
- transformers/models/align/processing_align.py +2 -30
- transformers/models/altclip/configuration_altclip.py +4 -7
- transformers/models/altclip/modeling_altclip.py +106 -103
- transformers/models/altclip/processing_altclip.py +2 -15
- transformers/models/apertus/__init__.py +0 -1
- transformers/models/apertus/configuration_apertus.py +23 -28
- transformers/models/apertus/modeling_apertus.py +35 -38
- transformers/models/apertus/modular_apertus.py +36 -40
- transformers/models/arcee/configuration_arcee.py +25 -30
- transformers/models/arcee/modeling_arcee.py +35 -38
- transformers/models/arcee/modular_arcee.py +20 -23
- transformers/models/aria/configuration_aria.py +31 -44
- transformers/models/aria/image_processing_aria.py +25 -27
- transformers/models/aria/modeling_aria.py +102 -102
- transformers/models/aria/modular_aria.py +111 -124
- transformers/models/aria/processing_aria.py +28 -35
- transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
- transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +9 -11
- transformers/models/audioflamingo3/__init__.py +0 -1
- transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +60 -52
- transformers/models/audioflamingo3/modular_audioflamingo3.py +52 -43
- transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
- transformers/models/auto/auto_factory.py +12 -11
- transformers/models/auto/configuration_auto.py +48 -5
- transformers/models/auto/feature_extraction_auto.py +5 -7
- transformers/models/auto/image_processing_auto.py +30 -39
- transformers/models/auto/modeling_auto.py +33 -199
- transformers/models/auto/processing_auto.py +11 -19
- transformers/models/auto/tokenization_auto.py +38 -37
- transformers/models/auto/video_processing_auto.py +7 -8
- transformers/models/autoformer/configuration_autoformer.py +4 -7
- transformers/models/autoformer/modeling_autoformer.py +100 -101
- transformers/models/aya_vision/configuration_aya_vision.py +4 -1
- transformers/models/aya_vision/modeling_aya_vision.py +64 -99
- transformers/models/aya_vision/modular_aya_vision.py +46 -74
- transformers/models/aya_vision/processing_aya_vision.py +25 -53
- transformers/models/bamba/configuration_bamba.py +46 -39
- transformers/models/bamba/modeling_bamba.py +83 -119
- transformers/models/bamba/modular_bamba.py +70 -109
- transformers/models/bark/configuration_bark.py +6 -8
- transformers/models/bark/generation_configuration_bark.py +3 -5
- transformers/models/bark/modeling_bark.py +64 -65
- transformers/models/bark/processing_bark.py +19 -41
- transformers/models/bart/configuration_bart.py +9 -5
- transformers/models/bart/modeling_bart.py +124 -129
- transformers/models/barthez/tokenization_barthez.py +1 -4
- transformers/models/bartpho/tokenization_bartpho.py +6 -7
- transformers/models/beit/configuration_beit.py +2 -15
- transformers/models/beit/image_processing_beit.py +53 -56
- transformers/models/beit/image_processing_beit_fast.py +11 -12
- transformers/models/beit/modeling_beit.py +65 -62
- transformers/models/bert/configuration_bert.py +12 -2
- transformers/models/bert/modeling_bert.py +117 -152
- transformers/models/bert/tokenization_bert.py +2 -4
- transformers/models/bert/tokenization_bert_legacy.py +3 -5
- transformers/models/bert_generation/configuration_bert_generation.py +17 -2
- transformers/models/bert_generation/modeling_bert_generation.py +53 -55
- transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
- transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
- transformers/models/bertweet/tokenization_bertweet.py +1 -3
- transformers/models/big_bird/configuration_big_bird.py +12 -9
- transformers/models/big_bird/modeling_big_bird.py +107 -124
- transformers/models/big_bird/tokenization_big_bird.py +1 -4
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +118 -118
- transformers/models/biogpt/configuration_biogpt.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +73 -79
- transformers/models/biogpt/modular_biogpt.py +60 -66
- transformers/models/biogpt/tokenization_biogpt.py +3 -5
- transformers/models/bit/configuration_bit.py +2 -5
- transformers/models/bit/image_processing_bit.py +21 -24
- transformers/models/bit/image_processing_bit_fast.py +0 -1
- transformers/models/bit/modeling_bit.py +15 -16
- transformers/models/bitnet/configuration_bitnet.py +23 -28
- transformers/models/bitnet/modeling_bitnet.py +34 -38
- transformers/models/bitnet/modular_bitnet.py +7 -10
- transformers/models/blenderbot/configuration_blenderbot.py +8 -5
- transformers/models/blenderbot/modeling_blenderbot.py +68 -99
- transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -5
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +70 -72
- transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
- transformers/models/blip/configuration_blip.py +9 -10
- transformers/models/blip/image_processing_blip.py +17 -20
- transformers/models/blip/image_processing_blip_fast.py +0 -1
- transformers/models/blip/modeling_blip.py +115 -108
- transformers/models/blip/modeling_blip_text.py +63 -65
- transformers/models/blip/processing_blip.py +5 -36
- transformers/models/blip_2/configuration_blip_2.py +2 -2
- transformers/models/blip_2/modeling_blip_2.py +145 -121
- transformers/models/blip_2/processing_blip_2.py +8 -38
- transformers/models/bloom/configuration_bloom.py +5 -2
- transformers/models/bloom/modeling_bloom.py +60 -60
- transformers/models/blt/configuration_blt.py +94 -86
- transformers/models/blt/modeling_blt.py +93 -90
- transformers/models/blt/modular_blt.py +127 -69
- transformers/models/bridgetower/configuration_bridgetower.py +7 -2
- transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +13 -14
- transformers/models/bridgetower/modeling_bridgetower.py +136 -124
- transformers/models/bridgetower/processing_bridgetower.py +2 -16
- transformers/models/bros/configuration_bros.py +24 -18
- transformers/models/bros/modeling_bros.py +78 -80
- transformers/models/bros/processing_bros.py +2 -12
- transformers/models/byt5/tokenization_byt5.py +4 -6
- transformers/models/camembert/configuration_camembert.py +8 -2
- transformers/models/camembert/modeling_camembert.py +97 -99
- transformers/models/camembert/modular_camembert.py +51 -54
- transformers/models/camembert/tokenization_camembert.py +1 -4
- transformers/models/canine/configuration_canine.py +4 -2
- transformers/models/canine/modeling_canine.py +73 -75
- transformers/models/canine/tokenization_canine.py +0 -1
- transformers/models/chameleon/configuration_chameleon.py +29 -34
- transformers/models/chameleon/image_processing_chameleon.py +21 -24
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -6
- transformers/models/chameleon/modeling_chameleon.py +135 -92
- transformers/models/chameleon/processing_chameleon.py +16 -41
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -8
- transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
- transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +93 -95
- transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
- transformers/models/clap/configuration_clap.py +4 -9
- transformers/models/clap/feature_extraction_clap.py +9 -10
- transformers/models/clap/modeling_clap.py +109 -111
- transformers/models/clap/processing_clap.py +2 -15
- transformers/models/clip/configuration_clip.py +4 -2
- transformers/models/clip/image_processing_clip.py +21 -24
- transformers/models/clip/image_processing_clip_fast.py +9 -1
- transformers/models/clip/modeling_clip.py +70 -68
- transformers/models/clip/processing_clip.py +2 -14
- transformers/models/clip/tokenization_clip.py +2 -5
- transformers/models/clipseg/configuration_clipseg.py +4 -2
- transformers/models/clipseg/modeling_clipseg.py +113 -112
- transformers/models/clipseg/processing_clipseg.py +19 -42
- transformers/models/clvp/configuration_clvp.py +15 -5
- transformers/models/clvp/feature_extraction_clvp.py +7 -10
- transformers/models/clvp/modeling_clvp.py +138 -145
- transformers/models/clvp/number_normalizer.py +1 -2
- transformers/models/clvp/processing_clvp.py +3 -20
- transformers/models/clvp/tokenization_clvp.py +0 -1
- transformers/models/code_llama/tokenization_code_llama.py +3 -6
- transformers/models/codegen/configuration_codegen.py +4 -4
- transformers/models/codegen/modeling_codegen.py +50 -49
- transformers/models/codegen/tokenization_codegen.py +5 -6
- transformers/models/cohere/configuration_cohere.py +25 -30
- transformers/models/cohere/modeling_cohere.py +39 -42
- transformers/models/cohere/modular_cohere.py +27 -31
- transformers/models/cohere/tokenization_cohere.py +5 -6
- transformers/models/cohere2/configuration_cohere2.py +27 -32
- transformers/models/cohere2/modeling_cohere2.py +38 -41
- transformers/models/cohere2/modular_cohere2.py +48 -52
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +9 -10
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +52 -55
- transformers/models/cohere2_vision/modular_cohere2_vision.py +41 -43
- transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
- transformers/models/colpali/configuration_colpali.py +0 -1
- transformers/models/colpali/modeling_colpali.py +14 -16
- transformers/models/colpali/modular_colpali.py +11 -51
- transformers/models/colpali/processing_colpali.py +14 -52
- transformers/models/colqwen2/modeling_colqwen2.py +27 -28
- transformers/models/colqwen2/modular_colqwen2.py +36 -74
- transformers/models/colqwen2/processing_colqwen2.py +16 -52
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -47
- transformers/models/conditional_detr/image_processing_conditional_detr.py +67 -70
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +50 -36
- transformers/models/conditional_detr/modeling_conditional_detr.py +851 -1001
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -5
- transformers/models/convbert/configuration_convbert.py +11 -8
- transformers/models/convbert/modeling_convbert.py +85 -87
- transformers/models/convbert/tokenization_convbert.py +0 -1
- transformers/models/convnext/configuration_convnext.py +2 -5
- transformers/models/convnext/image_processing_convnext.py +18 -21
- transformers/models/convnext/image_processing_convnext_fast.py +7 -8
- transformers/models/convnext/modeling_convnext.py +12 -14
- transformers/models/convnextv2/configuration_convnextv2.py +2 -5
- transformers/models/convnextv2/modeling_convnextv2.py +12 -14
- transformers/models/cpm/tokenization_cpm.py +6 -7
- transformers/models/cpm/tokenization_cpm_fast.py +3 -5
- transformers/models/cpmant/configuration_cpmant.py +4 -1
- transformers/models/cpmant/modeling_cpmant.py +38 -40
- transformers/models/cpmant/tokenization_cpmant.py +1 -3
- transformers/models/csm/configuration_csm.py +58 -66
- transformers/models/csm/generation_csm.py +13 -14
- transformers/models/csm/modeling_csm.py +81 -84
- transformers/models/csm/modular_csm.py +56 -58
- transformers/models/csm/processing_csm.py +25 -68
- transformers/models/ctrl/configuration_ctrl.py +16 -1
- transformers/models/ctrl/modeling_ctrl.py +51 -66
- transformers/models/ctrl/tokenization_ctrl.py +0 -1
- transformers/models/cvt/configuration_cvt.py +0 -1
- transformers/models/cvt/modeling_cvt.py +13 -15
- transformers/models/cwm/__init__.py +0 -1
- transformers/models/cwm/configuration_cwm.py +8 -12
- transformers/models/cwm/modeling_cwm.py +36 -38
- transformers/models/cwm/modular_cwm.py +10 -12
- transformers/models/d_fine/configuration_d_fine.py +10 -57
- transformers/models/d_fine/modeling_d_fine.py +786 -927
- transformers/models/d_fine/modular_d_fine.py +339 -417
- transformers/models/dab_detr/configuration_dab_detr.py +22 -49
- transformers/models/dab_detr/modeling_dab_detr.py +79 -77
- transformers/models/dac/configuration_dac.py +0 -1
- transformers/models/dac/feature_extraction_dac.py +6 -9
- transformers/models/dac/modeling_dac.py +22 -24
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -2
- transformers/models/data2vec/configuration_data2vec_text.py +11 -3
- transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
- transformers/models/data2vec/modeling_data2vec_audio.py +55 -59
- transformers/models/data2vec/modeling_data2vec_text.py +97 -99
- transformers/models/data2vec/modeling_data2vec_vision.py +45 -44
- transformers/models/data2vec/modular_data2vec_audio.py +6 -1
- transformers/models/data2vec/modular_data2vec_text.py +51 -54
- transformers/models/dbrx/configuration_dbrx.py +29 -22
- transformers/models/dbrx/modeling_dbrx.py +45 -48
- transformers/models/dbrx/modular_dbrx.py +37 -39
- transformers/models/deberta/configuration_deberta.py +6 -1
- transformers/models/deberta/modeling_deberta.py +57 -60
- transformers/models/deberta/tokenization_deberta.py +2 -5
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -1
- transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
- transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -2
- transformers/models/decision_transformer/modeling_decision_transformer.py +51 -53
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +41 -47
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -41
- transformers/models/deepseek_v2/modular_deepseek_v2.py +48 -52
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +42 -48
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +38 -40
- transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -10
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +6 -3
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +27 -28
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +12 -11
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -43
- transformers/models/deepseek_vl/modular_deepseek_vl.py +15 -43
- transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +7 -5
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +37 -37
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +22 -22
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +100 -56
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +141 -109
- transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -46
- transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +42 -28
- transformers/models/deformable_detr/modeling_deformable_detr.py +454 -652
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -5
- transformers/models/deit/configuration_deit.py +0 -1
- transformers/models/deit/image_processing_deit.py +18 -21
- transformers/models/deit/image_processing_deit_fast.py +0 -1
- transformers/models/deit/modeling_deit.py +27 -25
- transformers/models/depth_anything/configuration_depth_anything.py +12 -43
- transformers/models/depth_anything/modeling_depth_anything.py +10 -11
- transformers/models/depth_pro/configuration_depth_pro.py +0 -1
- transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +8 -9
- transformers/models/depth_pro/modeling_depth_pro.py +29 -27
- transformers/models/detr/configuration_detr.py +18 -50
- transformers/models/detr/image_processing_detr.py +64 -66
- transformers/models/detr/image_processing_detr_fast.py +33 -34
- transformers/models/detr/modeling_detr.py +748 -789
- transformers/models/dia/configuration_dia.py +9 -15
- transformers/models/dia/feature_extraction_dia.py +6 -9
- transformers/models/dia/generation_dia.py +48 -53
- transformers/models/dia/modeling_dia.py +68 -71
- transformers/models/dia/modular_dia.py +56 -58
- transformers/models/dia/processing_dia.py +39 -29
- transformers/models/dia/tokenization_dia.py +3 -6
- transformers/models/diffllama/configuration_diffllama.py +25 -30
- transformers/models/diffllama/modeling_diffllama.py +45 -53
- transformers/models/diffllama/modular_diffllama.py +18 -25
- transformers/models/dinat/configuration_dinat.py +2 -5
- transformers/models/dinat/modeling_dinat.py +47 -48
- transformers/models/dinov2/configuration_dinov2.py +2 -5
- transformers/models/dinov2/modeling_dinov2.py +20 -21
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +3 -5
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +21 -21
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +11 -14
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +6 -11
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +5 -9
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +7 -12
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +7 -8
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +19 -22
- transformers/models/dinov3_vit/modular_dinov3_vit.py +16 -19
- transformers/models/distilbert/configuration_distilbert.py +8 -2
- transformers/models/distilbert/modeling_distilbert.py +47 -49
- transformers/models/distilbert/tokenization_distilbert.py +0 -1
- transformers/models/doge/__init__.py +0 -1
- transformers/models/doge/configuration_doge.py +42 -35
- transformers/models/doge/modeling_doge.py +46 -49
- transformers/models/doge/modular_doge.py +77 -68
- transformers/models/donut/configuration_donut_swin.py +0 -1
- transformers/models/donut/image_processing_donut.py +26 -29
- transformers/models/donut/image_processing_donut_fast.py +9 -14
- transformers/models/donut/modeling_donut_swin.py +44 -46
- transformers/models/donut/processing_donut.py +5 -26
- transformers/models/dots1/configuration_dots1.py +43 -36
- transformers/models/dots1/modeling_dots1.py +35 -38
- transformers/models/dots1/modular_dots1.py +0 -1
- transformers/models/dpr/configuration_dpr.py +19 -2
- transformers/models/dpr/modeling_dpr.py +37 -39
- transformers/models/dpr/tokenization_dpr.py +7 -9
- transformers/models/dpr/tokenization_dpr_fast.py +7 -9
- transformers/models/dpt/configuration_dpt.py +23 -66
- transformers/models/dpt/image_processing_dpt.py +65 -66
- transformers/models/dpt/image_processing_dpt_fast.py +18 -19
- transformers/models/dpt/modeling_dpt.py +38 -36
- transformers/models/dpt/modular_dpt.py +14 -15
- transformers/models/edgetam/configuration_edgetam.py +1 -2
- transformers/models/edgetam/modeling_edgetam.py +87 -89
- transformers/models/edgetam/modular_edgetam.py +7 -13
- transformers/models/edgetam_video/__init__.py +0 -1
- transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +126 -128
- transformers/models/edgetam_video/modular_edgetam_video.py +25 -27
- transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
- transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +8 -7
- transformers/models/efficientloftr/modeling_efficientloftr.py +46 -38
- transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
- transformers/models/efficientnet/configuration_efficientnet.py +0 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +16 -17
- transformers/models/efficientnet/modeling_efficientnet.py +12 -14
- transformers/models/electra/configuration_electra.py +13 -3
- transformers/models/electra/modeling_electra.py +107 -109
- transformers/models/emu3/configuration_emu3.py +17 -17
- transformers/models/emu3/image_processing_emu3.py +44 -39
- transformers/models/emu3/modeling_emu3.py +143 -109
- transformers/models/emu3/modular_emu3.py +109 -73
- transformers/models/emu3/processing_emu3.py +18 -43
- transformers/models/encodec/configuration_encodec.py +2 -4
- transformers/models/encodec/feature_extraction_encodec.py +10 -13
- transformers/models/encodec/modeling_encodec.py +25 -29
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -2
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +37 -43
- transformers/models/eomt/configuration_eomt.py +12 -14
- transformers/models/eomt/image_processing_eomt.py +53 -55
- transformers/models/eomt/image_processing_eomt_fast.py +18 -19
- transformers/models/eomt/modeling_eomt.py +19 -21
- transformers/models/eomt/modular_eomt.py +28 -30
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -3
- transformers/models/ernie/modeling_ernie.py +127 -162
- transformers/models/ernie/modular_ernie.py +91 -103
- transformers/models/ernie4_5/configuration_ernie4_5.py +23 -27
- transformers/models/ernie4_5/modeling_ernie4_5.py +35 -37
- transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +34 -39
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +40 -42
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -7
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +305 -267
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +163 -142
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
- transformers/models/esm/configuration_esm.py +11 -15
- transformers/models/esm/modeling_esm.py +35 -37
- transformers/models/esm/modeling_esmfold.py +43 -50
- transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
- transformers/models/esm/openfold_utils/loss.py +1 -2
- transformers/models/esm/openfold_utils/protein.py +15 -16
- transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
- transformers/models/esm/tokenization_esm.py +2 -4
- transformers/models/evolla/configuration_evolla.py +50 -40
- transformers/models/evolla/modeling_evolla.py +69 -68
- transformers/models/evolla/modular_evolla.py +50 -48
- transformers/models/evolla/processing_evolla.py +23 -35
- transformers/models/exaone4/configuration_exaone4.py +27 -27
- transformers/models/exaone4/modeling_exaone4.py +36 -39
- transformers/models/exaone4/modular_exaone4.py +51 -50
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +31 -26
- transformers/models/falcon/modeling_falcon.py +76 -84
- transformers/models/falcon_h1/configuration_falcon_h1.py +57 -51
- transformers/models/falcon_h1/modeling_falcon_h1.py +74 -109
- transformers/models/falcon_h1/modular_falcon_h1.py +68 -100
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +64 -73
- transformers/models/falcon_mamba/modular_falcon_mamba.py +14 -13
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +70 -97
- transformers/models/fast_vlm/modular_fast_vlm.py +148 -38
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -6
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
- transformers/models/flaubert/configuration_flaubert.py +10 -5
- transformers/models/flaubert/modeling_flaubert.py +125 -129
- transformers/models/flaubert/tokenization_flaubert.py +3 -5
- transformers/models/flava/configuration_flava.py +9 -9
- transformers/models/flava/image_processing_flava.py +66 -67
- transformers/models/flava/image_processing_flava_fast.py +46 -47
- transformers/models/flava/modeling_flava.py +144 -135
- transformers/models/flava/processing_flava.py +2 -12
- transformers/models/flex_olmo/__init__.py +0 -1
- transformers/models/flex_olmo/configuration_flex_olmo.py +34 -39
- transformers/models/flex_olmo/modeling_flex_olmo.py +41 -43
- transformers/models/flex_olmo/modular_flex_olmo.py +46 -51
- transformers/models/florence2/configuration_florence2.py +4 -1
- transformers/models/florence2/modeling_florence2.py +96 -72
- transformers/models/florence2/modular_florence2.py +100 -107
- transformers/models/florence2/processing_florence2.py +18 -47
- transformers/models/fnet/configuration_fnet.py +6 -2
- transformers/models/fnet/modeling_fnet.py +69 -80
- transformers/models/fnet/tokenization_fnet.py +0 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -5
- transformers/models/focalnet/modeling_focalnet.py +49 -48
- transformers/models/fsmt/configuration_fsmt.py +12 -17
- transformers/models/fsmt/modeling_fsmt.py +47 -48
- transformers/models/fsmt/tokenization_fsmt.py +3 -5
- transformers/models/funnel/configuration_funnel.py +8 -1
- transformers/models/funnel/modeling_funnel.py +91 -93
- transformers/models/funnel/tokenization_funnel.py +2 -5
- transformers/models/fuyu/configuration_fuyu.py +28 -34
- transformers/models/fuyu/image_processing_fuyu.py +29 -31
- transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
- transformers/models/fuyu/modeling_fuyu.py +50 -52
- transformers/models/fuyu/processing_fuyu.py +9 -36
- transformers/models/gemma/configuration_gemma.py +25 -30
- transformers/models/gemma/modeling_gemma.py +36 -38
- transformers/models/gemma/modular_gemma.py +33 -36
- transformers/models/gemma/tokenization_gemma.py +3 -6
- transformers/models/gemma2/configuration_gemma2.py +30 -35
- transformers/models/gemma2/modeling_gemma2.py +38 -41
- transformers/models/gemma2/modular_gemma2.py +63 -67
- transformers/models/gemma3/configuration_gemma3.py +53 -48
- transformers/models/gemma3/image_processing_gemma3.py +29 -31
- transformers/models/gemma3/image_processing_gemma3_fast.py +11 -12
- transformers/models/gemma3/modeling_gemma3.py +123 -122
- transformers/models/gemma3/modular_gemma3.py +128 -125
- transformers/models/gemma3/processing_gemma3.py +5 -5
- transformers/models/gemma3n/configuration_gemma3n.py +42 -30
- transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
- transformers/models/gemma3n/modeling_gemma3n.py +166 -147
- transformers/models/gemma3n/modular_gemma3n.py +176 -148
- transformers/models/gemma3n/processing_gemma3n.py +12 -26
- transformers/models/git/configuration_git.py +5 -8
- transformers/models/git/modeling_git.py +115 -127
- transformers/models/git/processing_git.py +2 -14
- transformers/models/glm/configuration_glm.py +26 -30
- transformers/models/glm/modeling_glm.py +36 -39
- transformers/models/glm/modular_glm.py +4 -7
- transformers/models/glm4/configuration_glm4.py +26 -30
- transformers/models/glm4/modeling_glm4.py +39 -41
- transformers/models/glm4/modular_glm4.py +8 -10
- transformers/models/glm46v/configuration_glm46v.py +4 -1
- transformers/models/glm46v/image_processing_glm46v.py +40 -38
- transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
- transformers/models/glm46v/modeling_glm46v.py +138 -93
- transformers/models/glm46v/modular_glm46v.py +5 -3
- transformers/models/glm46v/processing_glm46v.py +7 -41
- transformers/models/glm46v/video_processing_glm46v.py +9 -11
- transformers/models/glm4_moe/configuration_glm4_moe.py +42 -35
- transformers/models/glm4_moe/modeling_glm4_moe.py +36 -39
- transformers/models/glm4_moe/modular_glm4_moe.py +43 -36
- transformers/models/glm4_moe_lite/__init__.py +28 -0
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +233 -0
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +302 -0
- transformers/models/glm4v/configuration_glm4v.py +25 -24
- transformers/models/glm4v/image_processing_glm4v.py +39 -38
- transformers/models/glm4v/image_processing_glm4v_fast.py +8 -9
- transformers/models/glm4v/modeling_glm4v.py +249 -210
- transformers/models/glm4v/modular_glm4v.py +211 -230
- transformers/models/glm4v/processing_glm4v.py +7 -41
- transformers/models/glm4v/video_processing_glm4v.py +9 -11
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +136 -127
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +348 -356
- transformers/models/glm4v_moe/modular_glm4v_moe.py +76 -174
- transformers/models/glm_image/__init__.py +31 -0
- transformers/models/glm_image/configuration_glm_image.py +358 -0
- transformers/models/glm_image/image_processing_glm_image.py +503 -0
- transformers/models/glm_image/image_processing_glm_image_fast.py +294 -0
- transformers/models/glm_image/modeling_glm_image.py +1691 -0
- transformers/models/glm_image/modular_glm_image.py +1640 -0
- transformers/models/glm_image/processing_glm_image.py +265 -0
- transformers/models/glm_ocr/__init__.py +28 -0
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/__init__.py +0 -1
- transformers/models/glmasr/configuration_glmasr.py +0 -1
- transformers/models/glmasr/modeling_glmasr.py +51 -46
- transformers/models/glmasr/modular_glmasr.py +39 -29
- transformers/models/glmasr/processing_glmasr.py +7 -8
- transformers/models/glpn/configuration_glpn.py +0 -1
- transformers/models/glpn/image_processing_glpn.py +11 -12
- transformers/models/glpn/image_processing_glpn_fast.py +11 -12
- transformers/models/glpn/modeling_glpn.py +14 -14
- transformers/models/got_ocr2/configuration_got_ocr2.py +10 -13
- transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +9 -10
- transformers/models/got_ocr2/modeling_got_ocr2.py +69 -77
- transformers/models/got_ocr2/modular_got_ocr2.py +60 -52
- transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
- transformers/models/gpt2/configuration_gpt2.py +13 -2
- transformers/models/gpt2/modeling_gpt2.py +111 -113
- transformers/models/gpt2/tokenization_gpt2.py +6 -9
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -2
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +78 -84
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -2
- transformers/models/gpt_neo/modeling_gpt_neo.py +66 -71
- transformers/models/gpt_neox/configuration_gpt_neox.py +27 -25
- transformers/models/gpt_neox/modeling_gpt_neox.py +74 -76
- transformers/models/gpt_neox/modular_gpt_neox.py +68 -70
- transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +24 -19
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +43 -46
- transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
- transformers/models/gpt_oss/configuration_gpt_oss.py +31 -30
- transformers/models/gpt_oss/modeling_gpt_oss.py +80 -114
- transformers/models/gpt_oss/modular_gpt_oss.py +62 -97
- transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
- transformers/models/gptj/configuration_gptj.py +4 -5
- transformers/models/gptj/modeling_gptj.py +85 -88
- transformers/models/granite/configuration_granite.py +28 -33
- transformers/models/granite/modeling_granite.py +43 -45
- transformers/models/granite/modular_granite.py +29 -31
- transformers/models/granite_speech/configuration_granite_speech.py +0 -1
- transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
- transformers/models/granite_speech/modeling_granite_speech.py +84 -60
- transformers/models/granite_speech/processing_granite_speech.py +11 -4
- transformers/models/granitemoe/configuration_granitemoe.py +31 -36
- transformers/models/granitemoe/modeling_granitemoe.py +39 -41
- transformers/models/granitemoe/modular_granitemoe.py +21 -23
- transformers/models/granitemoehybrid/__init__.py +0 -1
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +55 -48
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +82 -118
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +57 -65
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +33 -37
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +52 -56
- transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -46
- transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +28 -29
- transformers/models/grounding_dino/modeling_grounding_dino.py +161 -181
- transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
- transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
- transformers/models/groupvit/configuration_groupvit.py +4 -2
- transformers/models/groupvit/modeling_groupvit.py +98 -92
- transformers/models/helium/configuration_helium.py +25 -29
- transformers/models/helium/modeling_helium.py +37 -40
- transformers/models/helium/modular_helium.py +3 -7
- transformers/models/herbert/tokenization_herbert.py +4 -6
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -5
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +12 -14
- transformers/models/hgnet_v2/modular_hgnet_v2.py +13 -17
- transformers/models/hiera/configuration_hiera.py +2 -5
- transformers/models/hiera/modeling_hiera.py +71 -70
- transformers/models/hubert/configuration_hubert.py +4 -2
- transformers/models/hubert/modeling_hubert.py +42 -41
- transformers/models/hubert/modular_hubert.py +8 -11
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +26 -31
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +58 -37
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +31 -11
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +31 -36
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +54 -44
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +27 -15
- transformers/models/ibert/configuration_ibert.py +4 -2
- transformers/models/ibert/modeling_ibert.py +60 -62
- transformers/models/ibert/quant_modules.py +0 -1
- transformers/models/idefics/configuration_idefics.py +5 -8
- transformers/models/idefics/image_processing_idefics.py +13 -15
- transformers/models/idefics/modeling_idefics.py +63 -65
- transformers/models/idefics/perceiver.py +1 -3
- transformers/models/idefics/processing_idefics.py +32 -48
- transformers/models/idefics/vision.py +27 -28
- transformers/models/idefics2/configuration_idefics2.py +1 -3
- transformers/models/idefics2/image_processing_idefics2.py +31 -32
- transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
- transformers/models/idefics2/modeling_idefics2.py +126 -106
- transformers/models/idefics2/processing_idefics2.py +10 -68
- transformers/models/idefics3/configuration_idefics3.py +1 -4
- transformers/models/idefics3/image_processing_idefics3.py +42 -43
- transformers/models/idefics3/image_processing_idefics3_fast.py +40 -15
- transformers/models/idefics3/modeling_idefics3.py +113 -92
- transformers/models/idefics3/processing_idefics3.py +15 -69
- transformers/models/ijepa/configuration_ijepa.py +0 -1
- transformers/models/ijepa/modeling_ijepa.py +13 -14
- transformers/models/ijepa/modular_ijepa.py +5 -7
- transformers/models/imagegpt/configuration_imagegpt.py +9 -2
- transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +10 -11
- transformers/models/imagegpt/modeling_imagegpt.py +65 -62
- transformers/models/informer/configuration_informer.py +6 -9
- transformers/models/informer/modeling_informer.py +87 -89
- transformers/models/informer/modular_informer.py +13 -16
- transformers/models/instructblip/configuration_instructblip.py +2 -2
- transformers/models/instructblip/modeling_instructblip.py +104 -79
- transformers/models/instructblip/processing_instructblip.py +10 -36
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +108 -105
- transformers/models/instructblipvideo/modular_instructblipvideo.py +73 -64
- transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +6 -7
- transformers/models/internvl/configuration_internvl.py +5 -1
- transformers/models/internvl/modeling_internvl.py +76 -98
- transformers/models/internvl/modular_internvl.py +45 -59
- transformers/models/internvl/processing_internvl.py +12 -45
- transformers/models/internvl/video_processing_internvl.py +10 -11
- transformers/models/jais2/configuration_jais2.py +25 -29
- transformers/models/jais2/modeling_jais2.py +36 -38
- transformers/models/jais2/modular_jais2.py +20 -22
- transformers/models/jamba/configuration_jamba.py +5 -8
- transformers/models/jamba/modeling_jamba.py +47 -50
- transformers/models/jamba/modular_jamba.py +40 -41
- transformers/models/janus/configuration_janus.py +0 -1
- transformers/models/janus/image_processing_janus.py +37 -39
- transformers/models/janus/image_processing_janus_fast.py +20 -21
- transformers/models/janus/modeling_janus.py +103 -188
- transformers/models/janus/modular_janus.py +122 -83
- transformers/models/janus/processing_janus.py +17 -43
- transformers/models/jetmoe/configuration_jetmoe.py +26 -27
- transformers/models/jetmoe/modeling_jetmoe.py +42 -45
- transformers/models/jetmoe/modular_jetmoe.py +33 -36
- transformers/models/kosmos2/configuration_kosmos2.py +10 -9
- transformers/models/kosmos2/modeling_kosmos2.py +199 -178
- transformers/models/kosmos2/processing_kosmos2.py +40 -55
- transformers/models/kosmos2_5/__init__.py +0 -1
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -9
- transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +162 -172
- transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +31 -28
- transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +103 -106
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +20 -22
- transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
- transformers/models/lasr/configuration_lasr.py +3 -7
- transformers/models/lasr/feature_extraction_lasr.py +10 -12
- transformers/models/lasr/modeling_lasr.py +21 -24
- transformers/models/lasr/modular_lasr.py +11 -13
- transformers/models/lasr/processing_lasr.py +12 -6
- transformers/models/lasr/tokenization_lasr.py +2 -4
- transformers/models/layoutlm/configuration_layoutlm.py +14 -2
- transformers/models/layoutlm/modeling_layoutlm.py +70 -72
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -17
- transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +7 -8
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
- transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -19
- transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +9 -10
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
- transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -17
- transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
- transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
- transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
- transformers/models/led/configuration_led.py +8 -12
- transformers/models/led/modeling_led.py +113 -267
- transformers/models/levit/configuration_levit.py +0 -1
- transformers/models/levit/image_processing_levit.py +19 -21
- transformers/models/levit/image_processing_levit_fast.py +4 -5
- transformers/models/levit/modeling_levit.py +17 -19
- transformers/models/lfm2/configuration_lfm2.py +27 -30
- transformers/models/lfm2/modeling_lfm2.py +46 -48
- transformers/models/lfm2/modular_lfm2.py +32 -32
- transformers/models/lfm2_moe/__init__.py +0 -1
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +6 -9
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +48 -49
- transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +43 -20
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +73 -61
- transformers/models/lfm2_vl/modular_lfm2_vl.py +66 -54
- transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
- transformers/models/lightglue/image_processing_lightglue.py +16 -15
- transformers/models/lightglue/image_processing_lightglue_fast.py +8 -7
- transformers/models/lightglue/modeling_lightglue.py +31 -33
- transformers/models/lightglue/modular_lightglue.py +31 -31
- transformers/models/lighton_ocr/__init__.py +28 -0
- transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +463 -0
- transformers/models/lighton_ocr/modular_lighton_ocr.py +404 -0
- transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
- transformers/models/lilt/configuration_lilt.py +6 -2
- transformers/models/lilt/modeling_lilt.py +53 -55
- transformers/models/llama/configuration_llama.py +26 -31
- transformers/models/llama/modeling_llama.py +35 -38
- transformers/models/llama/tokenization_llama.py +2 -4
- transformers/models/llama4/configuration_llama4.py +87 -69
- transformers/models/llama4/image_processing_llama4_fast.py +11 -12
- transformers/models/llama4/modeling_llama4.py +116 -115
- transformers/models/llama4/processing_llama4.py +33 -57
- transformers/models/llava/configuration_llava.py +10 -1
- transformers/models/llava/image_processing_llava.py +25 -28
- transformers/models/llava/image_processing_llava_fast.py +9 -10
- transformers/models/llava/modeling_llava.py +73 -102
- transformers/models/llava/processing_llava.py +18 -51
- transformers/models/llava_next/configuration_llava_next.py +2 -2
- transformers/models/llava_next/image_processing_llava_next.py +43 -45
- transformers/models/llava_next/image_processing_llava_next_fast.py +11 -12
- transformers/models/llava_next/modeling_llava_next.py +103 -104
- transformers/models/llava_next/processing_llava_next.py +18 -47
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -7
- transformers/models/llava_next_video/modeling_llava_next_video.py +168 -155
- transformers/models/llava_next_video/modular_llava_next_video.py +154 -147
- transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
- transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -7
- transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +14 -14
- transformers/models/llava_onevision/modeling_llava_onevision.py +170 -166
- transformers/models/llava_onevision/modular_llava_onevision.py +156 -152
- transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
- transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
- transformers/models/longcat_flash/__init__.py +0 -1
- transformers/models/longcat_flash/configuration_longcat_flash.py +39 -45
- transformers/models/longcat_flash/modeling_longcat_flash.py +37 -38
- transformers/models/longcat_flash/modular_longcat_flash.py +23 -24
- transformers/models/longformer/configuration_longformer.py +5 -5
- transformers/models/longformer/modeling_longformer.py +99 -101
- transformers/models/longt5/configuration_longt5.py +9 -7
- transformers/models/longt5/modeling_longt5.py +45 -45
- transformers/models/luke/configuration_luke.py +8 -2
- transformers/models/luke/modeling_luke.py +179 -181
- transformers/models/luke/tokenization_luke.py +99 -105
- transformers/{pipelines/deprecated → models/lw_detr}/__init__.py +14 -3
- transformers/models/lw_detr/configuration_lw_detr.py +362 -0
- transformers/models/lw_detr/modeling_lw_detr.py +1697 -0
- transformers/models/lw_detr/modular_lw_detr.py +1609 -0
- transformers/models/lxmert/configuration_lxmert.py +16 -1
- transformers/models/lxmert/modeling_lxmert.py +63 -74
- transformers/models/m2m_100/configuration_m2m_100.py +7 -9
- transformers/models/m2m_100/modeling_m2m_100.py +72 -74
- transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
- transformers/models/mamba/configuration_mamba.py +5 -3
- transformers/models/mamba/modeling_mamba.py +61 -70
- transformers/models/mamba2/configuration_mamba2.py +5 -8
- transformers/models/mamba2/modeling_mamba2.py +66 -79
- transformers/models/marian/configuration_marian.py +10 -5
- transformers/models/marian/modeling_marian.py +88 -90
- transformers/models/marian/tokenization_marian.py +6 -6
- transformers/models/markuplm/configuration_markuplm.py +4 -7
- transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
- transformers/models/markuplm/modeling_markuplm.py +63 -65
- transformers/models/markuplm/processing_markuplm.py +31 -38
- transformers/models/markuplm/tokenization_markuplm.py +67 -77
- transformers/models/mask2former/configuration_mask2former.py +14 -52
- transformers/models/mask2former/image_processing_mask2former.py +84 -85
- transformers/models/mask2former/image_processing_mask2former_fast.py +36 -36
- transformers/models/mask2former/modeling_mask2former.py +108 -104
- transformers/models/mask2former/modular_mask2former.py +6 -8
- transformers/models/maskformer/configuration_maskformer.py +17 -51
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -5
- transformers/models/maskformer/image_processing_maskformer.py +84 -85
- transformers/models/maskformer/image_processing_maskformer_fast.py +35 -36
- transformers/models/maskformer/modeling_maskformer.py +71 -67
- transformers/models/maskformer/modeling_maskformer_swin.py +20 -23
- transformers/models/mbart/configuration_mbart.py +9 -5
- transformers/models/mbart/modeling_mbart.py +120 -119
- transformers/models/mbart/tokenization_mbart.py +2 -4
- transformers/models/mbart50/tokenization_mbart50.py +3 -5
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -3
- transformers/models/megatron_bert/modeling_megatron_bert.py +139 -165
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +94 -87
- transformers/models/metaclip_2/modular_metaclip_2.py +59 -45
- transformers/models/mgp_str/configuration_mgp_str.py +0 -1
- transformers/models/mgp_str/modeling_mgp_str.py +18 -18
- transformers/models/mgp_str/processing_mgp_str.py +3 -20
- transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
- transformers/models/mimi/configuration_mimi.py +42 -40
- transformers/models/mimi/modeling_mimi.py +116 -115
- transformers/models/minimax/__init__.py +0 -1
- transformers/models/minimax/configuration_minimax.py +40 -47
- transformers/models/minimax/modeling_minimax.py +46 -49
- transformers/models/minimax/modular_minimax.py +59 -65
- transformers/models/minimax_m2/__init__.py +28 -0
- transformers/models/minimax_m2/configuration_minimax_m2.py +188 -0
- transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
- transformers/models/minimax_m2/modular_minimax_m2.py +346 -0
- transformers/models/ministral/configuration_ministral.py +25 -29
- transformers/models/ministral/modeling_ministral.py +35 -37
- transformers/models/ministral/modular_ministral.py +32 -37
- transformers/models/ministral3/configuration_ministral3.py +23 -26
- transformers/models/ministral3/modeling_ministral3.py +35 -37
- transformers/models/ministral3/modular_ministral3.py +7 -8
- transformers/models/mistral/configuration_mistral.py +24 -29
- transformers/models/mistral/modeling_mistral.py +35 -37
- transformers/models/mistral/modular_mistral.py +14 -15
- transformers/models/mistral3/configuration_mistral3.py +4 -1
- transformers/models/mistral3/modeling_mistral3.py +79 -82
- transformers/models/mistral3/modular_mistral3.py +66 -67
- transformers/models/mixtral/configuration_mixtral.py +32 -38
- transformers/models/mixtral/modeling_mixtral.py +39 -42
- transformers/models/mixtral/modular_mixtral.py +26 -29
- transformers/models/mlcd/configuration_mlcd.py +0 -1
- transformers/models/mlcd/modeling_mlcd.py +17 -17
- transformers/models/mlcd/modular_mlcd.py +16 -16
- transformers/models/mllama/configuration_mllama.py +10 -15
- transformers/models/mllama/image_processing_mllama.py +23 -25
- transformers/models/mllama/image_processing_mllama_fast.py +11 -11
- transformers/models/mllama/modeling_mllama.py +100 -103
- transformers/models/mllama/processing_mllama.py +6 -55
- transformers/models/mluke/tokenization_mluke.py +97 -103
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -46
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +159 -179
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -46
- transformers/models/mobilebert/configuration_mobilebert.py +4 -2
- transformers/models/mobilebert/modeling_mobilebert.py +78 -88
- transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
- transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
- transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +14 -15
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +21 -22
- transformers/models/mobilevit/configuration_mobilevit.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +12 -13
- transformers/models/mobilevit/modeling_mobilevit.py +21 -21
- transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -22
- transformers/models/modernbert/configuration_modernbert.py +76 -51
- transformers/models/modernbert/modeling_modernbert.py +188 -943
- transformers/models/modernbert/modular_modernbert.py +255 -978
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +50 -44
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -64
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +92 -92
- transformers/models/moonshine/configuration_moonshine.py +34 -31
- transformers/models/moonshine/modeling_moonshine.py +70 -72
- transformers/models/moonshine/modular_moonshine.py +91 -86
- transformers/models/moshi/configuration_moshi.py +46 -23
- transformers/models/moshi/modeling_moshi.py +134 -142
- transformers/models/mpnet/configuration_mpnet.py +6 -2
- transformers/models/mpnet/modeling_mpnet.py +55 -57
- transformers/models/mpnet/tokenization_mpnet.py +1 -4
- transformers/models/mpt/configuration_mpt.py +17 -9
- transformers/models/mpt/modeling_mpt.py +58 -60
- transformers/models/mra/configuration_mra.py +8 -2
- transformers/models/mra/modeling_mra.py +54 -56
- transformers/models/mt5/configuration_mt5.py +9 -6
- transformers/models/mt5/modeling_mt5.py +80 -85
- transformers/models/musicgen/configuration_musicgen.py +12 -8
- transformers/models/musicgen/modeling_musicgen.py +114 -116
- transformers/models/musicgen/processing_musicgen.py +3 -21
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -8
- transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +113 -126
- transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
- transformers/models/mvp/configuration_mvp.py +8 -5
- transformers/models/mvp/modeling_mvp.py +121 -123
- transformers/models/myt5/tokenization_myt5.py +8 -10
- transformers/models/nanochat/configuration_nanochat.py +5 -8
- transformers/models/nanochat/modeling_nanochat.py +36 -39
- transformers/models/nanochat/modular_nanochat.py +16 -18
- transformers/models/nemotron/configuration_nemotron.py +25 -30
- transformers/models/nemotron/modeling_nemotron.py +53 -66
- transformers/models/nllb/tokenization_nllb.py +14 -14
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -10
- transformers/models/nllb_moe/modeling_nllb_moe.py +70 -72
- transformers/models/nougat/image_processing_nougat.py +29 -32
- transformers/models/nougat/image_processing_nougat_fast.py +12 -13
- transformers/models/nougat/processing_nougat.py +37 -39
- transformers/models/nougat/tokenization_nougat.py +5 -7
- transformers/models/nystromformer/configuration_nystromformer.py +8 -2
- transformers/models/nystromformer/modeling_nystromformer.py +61 -63
- transformers/models/olmo/configuration_olmo.py +23 -28
- transformers/models/olmo/modeling_olmo.py +35 -38
- transformers/models/olmo/modular_olmo.py +8 -12
- transformers/models/olmo2/configuration_olmo2.py +27 -32
- transformers/models/olmo2/modeling_olmo2.py +36 -39
- transformers/models/olmo2/modular_olmo2.py +36 -38
- transformers/models/olmo3/__init__.py +0 -1
- transformers/models/olmo3/configuration_olmo3.py +30 -34
- transformers/models/olmo3/modeling_olmo3.py +35 -38
- transformers/models/olmo3/modular_olmo3.py +44 -47
- transformers/models/olmoe/configuration_olmoe.py +29 -33
- transformers/models/olmoe/modeling_olmoe.py +41 -43
- transformers/models/olmoe/modular_olmoe.py +15 -16
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -50
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +59 -57
- transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
- transformers/models/oneformer/configuration_oneformer.py +11 -51
- transformers/models/oneformer/image_processing_oneformer.py +83 -84
- transformers/models/oneformer/image_processing_oneformer_fast.py +41 -42
- transformers/models/oneformer/modeling_oneformer.py +137 -133
- transformers/models/oneformer/processing_oneformer.py +28 -43
- transformers/models/openai/configuration_openai.py +16 -1
- transformers/models/openai/modeling_openai.py +50 -51
- transformers/models/openai/tokenization_openai.py +2 -5
- transformers/models/opt/configuration_opt.py +6 -7
- transformers/models/opt/modeling_opt.py +79 -80
- transformers/models/ovis2/__init__.py +0 -1
- transformers/models/ovis2/configuration_ovis2.py +4 -1
- transformers/models/ovis2/image_processing_ovis2.py +22 -24
- transformers/models/ovis2/image_processing_ovis2_fast.py +9 -10
- transformers/models/ovis2/modeling_ovis2.py +99 -142
- transformers/models/ovis2/modular_ovis2.py +82 -45
- transformers/models/ovis2/processing_ovis2.py +12 -40
- transformers/models/owlv2/configuration_owlv2.py +4 -2
- transformers/models/owlv2/image_processing_owlv2.py +20 -21
- transformers/models/owlv2/image_processing_owlv2_fast.py +12 -13
- transformers/models/owlv2/modeling_owlv2.py +122 -114
- transformers/models/owlv2/modular_owlv2.py +11 -12
- transformers/models/owlv2/processing_owlv2.py +20 -49
- transformers/models/owlvit/configuration_owlvit.py +4 -2
- transformers/models/owlvit/image_processing_owlvit.py +21 -22
- transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
- transformers/models/owlvit/modeling_owlvit.py +121 -113
- transformers/models/owlvit/processing_owlvit.py +20 -48
- transformers/models/paddleocr_vl/__init__.py +0 -1
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +28 -29
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +159 -158
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +148 -119
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
- transformers/models/paligemma/configuration_paligemma.py +4 -1
- transformers/models/paligemma/modeling_paligemma.py +81 -79
- transformers/models/paligemma/processing_paligemma.py +13 -66
- transformers/models/parakeet/configuration_parakeet.py +3 -8
- transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
- transformers/models/parakeet/modeling_parakeet.py +21 -25
- transformers/models/parakeet/modular_parakeet.py +19 -21
- transformers/models/parakeet/processing_parakeet.py +12 -5
- transformers/models/parakeet/tokenization_parakeet.py +2 -4
- transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +63 -65
- transformers/models/patchtst/configuration_patchtst.py +6 -9
- transformers/models/patchtst/modeling_patchtst.py +75 -77
- transformers/models/pe_audio/__init__.py +0 -1
- transformers/models/pe_audio/configuration_pe_audio.py +14 -16
- transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
- transformers/models/pe_audio/modeling_pe_audio.py +30 -31
- transformers/models/pe_audio/modular_pe_audio.py +17 -18
- transformers/models/pe_audio/processing_pe_audio.py +0 -1
- transformers/models/pe_audio_video/__init__.py +0 -1
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +64 -65
- transformers/models/pe_audio_video/modular_pe_audio_video.py +56 -57
- transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
- transformers/models/pe_video/__init__.py +0 -1
- transformers/models/pe_video/configuration_pe_video.py +14 -16
- transformers/models/pe_video/modeling_pe_video.py +57 -46
- transformers/models/pe_video/modular_pe_video.py +47 -35
- transformers/models/pe_video/video_processing_pe_video.py +2 -4
- transformers/models/pegasus/configuration_pegasus.py +8 -6
- transformers/models/pegasus/modeling_pegasus.py +67 -69
- transformers/models/pegasus/tokenization_pegasus.py +1 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -4
- transformers/models/pegasus_x/modeling_pegasus_x.py +53 -55
- transformers/models/perceiver/configuration_perceiver.py +0 -1
- transformers/models/perceiver/image_processing_perceiver.py +22 -25
- transformers/models/perceiver/image_processing_perceiver_fast.py +7 -8
- transformers/models/perceiver/modeling_perceiver.py +152 -145
- transformers/models/perceiver/tokenization_perceiver.py +3 -6
- transformers/models/perception_lm/configuration_perception_lm.py +0 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
- transformers/models/perception_lm/modeling_perception_lm.py +64 -67
- transformers/models/perception_lm/modular_perception_lm.py +58 -58
- transformers/models/perception_lm/processing_perception_lm.py +13 -47
- transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
- transformers/models/persimmon/configuration_persimmon.py +23 -28
- transformers/models/persimmon/modeling_persimmon.py +44 -47
- transformers/models/phi/configuration_phi.py +27 -28
- transformers/models/phi/modeling_phi.py +39 -41
- transformers/models/phi/modular_phi.py +26 -26
- transformers/models/phi3/configuration_phi3.py +32 -37
- transformers/models/phi3/modeling_phi3.py +37 -40
- transformers/models/phi3/modular_phi3.py +16 -20
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +36 -39
- transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +100 -117
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +103 -90
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
- transformers/models/phimoe/configuration_phimoe.py +31 -36
- transformers/models/phimoe/modeling_phimoe.py +50 -77
- transformers/models/phimoe/modular_phimoe.py +12 -8
- transformers/models/phobert/tokenization_phobert.py +4 -6
- transformers/models/pix2struct/configuration_pix2struct.py +12 -10
- transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +12 -15
- transformers/models/pix2struct/modeling_pix2struct.py +56 -52
- transformers/models/pix2struct/processing_pix2struct.py +5 -26
- transformers/models/pixio/__init__.py +0 -1
- transformers/models/pixio/configuration_pixio.py +2 -5
- transformers/models/pixio/modeling_pixio.py +16 -17
- transformers/models/pixio/modular_pixio.py +7 -8
- transformers/models/pixtral/configuration_pixtral.py +11 -14
- transformers/models/pixtral/image_processing_pixtral.py +26 -28
- transformers/models/pixtral/image_processing_pixtral_fast.py +10 -11
- transformers/models/pixtral/modeling_pixtral.py +31 -37
- transformers/models/pixtral/processing_pixtral.py +18 -52
- transformers/models/plbart/configuration_plbart.py +8 -6
- transformers/models/plbart/modeling_plbart.py +109 -109
- transformers/models/plbart/modular_plbart.py +31 -33
- transformers/models/plbart/tokenization_plbart.py +4 -5
- transformers/models/poolformer/configuration_poolformer.py +0 -1
- transformers/models/poolformer/image_processing_poolformer.py +21 -24
- transformers/models/poolformer/image_processing_poolformer_fast.py +13 -14
- transformers/models/poolformer/modeling_poolformer.py +10 -12
- transformers/models/pop2piano/configuration_pop2piano.py +7 -7
- transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
- transformers/models/pop2piano/modeling_pop2piano.py +24 -24
- transformers/models/pop2piano/processing_pop2piano.py +25 -33
- transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +13 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +20 -21
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +17 -16
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +21 -20
- transformers/models/prophetnet/configuration_prophetnet.py +37 -38
- transformers/models/prophetnet/modeling_prophetnet.py +121 -153
- transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
- transformers/models/pvt/configuration_pvt.py +0 -1
- transformers/models/pvt/image_processing_pvt.py +24 -27
- transformers/models/pvt/image_processing_pvt_fast.py +1 -2
- transformers/models/pvt/modeling_pvt.py +19 -21
- transformers/models/pvt_v2/configuration_pvt_v2.py +4 -8
- transformers/models/pvt_v2/modeling_pvt_v2.py +27 -28
- transformers/models/qwen2/configuration_qwen2.py +32 -25
- transformers/models/qwen2/modeling_qwen2.py +35 -37
- transformers/models/qwen2/modular_qwen2.py +14 -15
- transformers/models/qwen2/tokenization_qwen2.py +2 -9
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +36 -27
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +241 -214
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +228 -193
- transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +28 -34
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +188 -145
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +64 -91
- transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
- transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +39 -41
- transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +42 -35
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +40 -43
- transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -13
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +28 -33
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +12 -15
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +184 -141
- transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +38 -18
- transformers/models/qwen3/configuration_qwen3.py +34 -27
- transformers/models/qwen3/modeling_qwen3.py +35 -38
- transformers/models/qwen3/modular_qwen3.py +7 -9
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +45 -35
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +40 -43
- transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
- transformers/models/qwen3_next/configuration_qwen3_next.py +47 -38
- transformers/models/qwen3_next/modeling_qwen3_next.py +44 -47
- transformers/models/qwen3_next/modular_qwen3_next.py +37 -38
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +139 -106
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +266 -206
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +228 -181
- transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +22 -24
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +185 -122
- transformers/models/qwen3_vl/modular_qwen3_vl.py +153 -139
- transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
- transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +27 -30
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +249 -178
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +55 -42
- transformers/models/rag/configuration_rag.py +6 -7
- transformers/models/rag/modeling_rag.py +119 -121
- transformers/models/rag/retrieval_rag.py +3 -5
- transformers/models/rag/tokenization_rag.py +0 -50
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +29 -30
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +35 -39
- transformers/models/reformer/configuration_reformer.py +7 -8
- transformers/models/reformer/modeling_reformer.py +67 -68
- transformers/models/reformer/tokenization_reformer.py +3 -6
- transformers/models/regnet/configuration_regnet.py +0 -1
- transformers/models/regnet/modeling_regnet.py +7 -9
- transformers/models/rembert/configuration_rembert.py +8 -2
- transformers/models/rembert/modeling_rembert.py +108 -132
- transformers/models/rembert/tokenization_rembert.py +1 -4
- transformers/models/resnet/configuration_resnet.py +2 -5
- transformers/models/resnet/modeling_resnet.py +14 -15
- transformers/models/roberta/configuration_roberta.py +11 -3
- transformers/models/roberta/modeling_roberta.py +97 -99
- transformers/models/roberta/modular_roberta.py +55 -58
- transformers/models/roberta/tokenization_roberta.py +2 -5
- transformers/models/roberta/tokenization_roberta_old.py +2 -4
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -3
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +97 -99
- transformers/models/roc_bert/configuration_roc_bert.py +8 -2
- transformers/models/roc_bert/modeling_roc_bert.py +125 -162
- transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
- transformers/models/roformer/configuration_roformer.py +13 -3
- transformers/models/roformer/modeling_roformer.py +79 -95
- transformers/models/roformer/tokenization_roformer.py +3 -6
- transformers/models/roformer/tokenization_utils.py +0 -1
- transformers/models/rt_detr/configuration_rt_detr.py +8 -50
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -5
- transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +39 -26
- transformers/models/rt_detr/modeling_rt_detr.py +643 -804
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +4 -7
- transformers/models/rt_detr/modular_rt_detr.py +1522 -20
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -58
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +384 -521
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +27 -70
- transformers/models/rwkv/configuration_rwkv.py +2 -4
- transformers/models/rwkv/modeling_rwkv.py +29 -54
- transformers/models/sam/configuration_sam.py +2 -1
- transformers/models/sam/image_processing_sam.py +59 -60
- transformers/models/sam/image_processing_sam_fast.py +25 -26
- transformers/models/sam/modeling_sam.py +46 -43
- transformers/models/sam/processing_sam.py +39 -27
- transformers/models/sam2/configuration_sam2.py +1 -2
- transformers/models/sam2/image_processing_sam2_fast.py +14 -15
- transformers/models/sam2/modeling_sam2.py +96 -94
- transformers/models/sam2/modular_sam2.py +85 -94
- transformers/models/sam2/processing_sam2.py +31 -47
- transformers/models/sam2_video/configuration_sam2_video.py +0 -1
- transformers/models/sam2_video/modeling_sam2_video.py +114 -116
- transformers/models/sam2_video/modular_sam2_video.py +72 -89
- transformers/models/sam2_video/processing_sam2_video.py +49 -66
- transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
- transformers/models/sam3/configuration_sam3.py +0 -1
- transformers/models/sam3/image_processing_sam3_fast.py +17 -20
- transformers/models/sam3/modeling_sam3.py +94 -100
- transformers/models/sam3/modular_sam3.py +3 -8
- transformers/models/sam3/processing_sam3.py +37 -52
- transformers/models/sam3_tracker/__init__.py +0 -1
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -3
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +79 -80
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -2
- transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -48
- transformers/models/sam3_tracker_video/__init__.py +0 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +115 -114
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -24
- transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
- transformers/models/sam3_video/configuration_sam3_video.py +0 -1
- transformers/models/sam3_video/modeling_sam3_video.py +56 -45
- transformers/models/sam3_video/processing_sam3_video.py +25 -45
- transformers/models/sam_hq/__init__.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +2 -1
- transformers/models/sam_hq/modeling_sam_hq.py +52 -50
- transformers/models/sam_hq/modular_sam_hq.py +23 -25
- transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +41 -29
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -10
- transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
- transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -10
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
- transformers/models/seed_oss/configuration_seed_oss.py +30 -34
- transformers/models/seed_oss/modeling_seed_oss.py +34 -36
- transformers/models/seed_oss/modular_seed_oss.py +6 -7
- transformers/models/segformer/configuration_segformer.py +0 -10
- transformers/models/segformer/image_processing_segformer.py +39 -42
- transformers/models/segformer/image_processing_segformer_fast.py +11 -12
- transformers/models/segformer/modeling_segformer.py +28 -28
- transformers/models/segformer/modular_segformer.py +8 -9
- transformers/models/seggpt/configuration_seggpt.py +0 -1
- transformers/models/seggpt/image_processing_seggpt.py +38 -41
- transformers/models/seggpt/modeling_seggpt.py +48 -38
- transformers/models/sew/configuration_sew.py +4 -2
- transformers/models/sew/modeling_sew.py +42 -40
- transformers/models/sew/modular_sew.py +12 -13
- transformers/models/sew_d/configuration_sew_d.py +4 -2
- transformers/models/sew_d/modeling_sew_d.py +32 -31
- transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +19 -21
- transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
- transformers/models/siglip/configuration_siglip.py +4 -2
- transformers/models/siglip/image_processing_siglip.py +17 -20
- transformers/models/siglip/image_processing_siglip_fast.py +0 -1
- transformers/models/siglip/modeling_siglip.py +65 -110
- transformers/models/siglip/processing_siglip.py +2 -14
- transformers/models/siglip/tokenization_siglip.py +6 -7
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2.py +15 -16
- transformers/models/siglip2/image_processing_siglip2_fast.py +6 -7
- transformers/models/siglip2/modeling_siglip2.py +89 -130
- transformers/models/siglip2/modular_siglip2.py +95 -48
- transformers/models/siglip2/processing_siglip2.py +2 -14
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +29 -32
- transformers/models/smollm3/modeling_smollm3.py +35 -38
- transformers/models/smollm3/modular_smollm3.py +36 -38
- transformers/models/smolvlm/configuration_smolvlm.py +2 -4
- transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +41 -15
- transformers/models/smolvlm/modeling_smolvlm.py +124 -96
- transformers/models/smolvlm/modular_smolvlm.py +50 -39
- transformers/models/smolvlm/processing_smolvlm.py +15 -76
- transformers/models/smolvlm/video_processing_smolvlm.py +16 -17
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +26 -27
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
- transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
- transformers/models/speech_to_text/modeling_speech_to_text.py +55 -57
- transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
- transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
- transformers/models/speecht5/configuration_speecht5.py +7 -9
- transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
- transformers/models/speecht5/modeling_speecht5.py +172 -174
- transformers/models/speecht5/number_normalizer.py +0 -1
- transformers/models/speecht5/processing_speecht5.py +3 -37
- transformers/models/speecht5/tokenization_speecht5.py +4 -5
- transformers/models/splinter/configuration_splinter.py +6 -7
- transformers/models/splinter/modeling_splinter.py +62 -59
- transformers/models/splinter/tokenization_splinter.py +2 -4
- transformers/models/squeezebert/configuration_squeezebert.py +14 -2
- transformers/models/squeezebert/modeling_squeezebert.py +60 -62
- transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
- transformers/models/stablelm/configuration_stablelm.py +28 -29
- transformers/models/stablelm/modeling_stablelm.py +44 -47
- transformers/models/starcoder2/configuration_starcoder2.py +30 -27
- transformers/models/starcoder2/modeling_starcoder2.py +38 -41
- transformers/models/starcoder2/modular_starcoder2.py +17 -19
- transformers/models/superglue/configuration_superglue.py +7 -3
- transformers/models/superglue/image_processing_superglue.py +15 -15
- transformers/models/superglue/image_processing_superglue_fast.py +8 -8
- transformers/models/superglue/modeling_superglue.py +41 -37
- transformers/models/superpoint/image_processing_superpoint.py +15 -15
- transformers/models/superpoint/image_processing_superpoint_fast.py +7 -9
- transformers/models/superpoint/modeling_superpoint.py +17 -16
- transformers/models/swiftformer/configuration_swiftformer.py +0 -1
- transformers/models/swiftformer/modeling_swiftformer.py +12 -14
- transformers/models/swin/configuration_swin.py +2 -5
- transformers/models/swin/modeling_swin.py +69 -78
- transformers/models/swin2sr/configuration_swin2sr.py +0 -1
- transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +4 -7
- transformers/models/swin2sr/modeling_swin2sr.py +30 -30
- transformers/models/swinv2/configuration_swinv2.py +2 -5
- transformers/models/swinv2/modeling_swinv2.py +65 -74
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -7
- transformers/models/switch_transformers/modeling_switch_transformers.py +35 -36
- transformers/models/switch_transformers/modular_switch_transformers.py +32 -33
- transformers/models/t5/configuration_t5.py +9 -9
- transformers/models/t5/modeling_t5.py +80 -85
- transformers/models/t5/tokenization_t5.py +1 -3
- transformers/models/t5gemma/configuration_t5gemma.py +43 -59
- transformers/models/t5gemma/modeling_t5gemma.py +105 -108
- transformers/models/t5gemma/modular_t5gemma.py +128 -142
- transformers/models/t5gemma2/configuration_t5gemma2.py +86 -100
- transformers/models/t5gemma2/modeling_t5gemma2.py +234 -194
- transformers/models/t5gemma2/modular_t5gemma2.py +279 -264
- transformers/models/table_transformer/configuration_table_transformer.py +18 -50
- transformers/models/table_transformer/modeling_table_transformer.py +73 -101
- transformers/models/tapas/configuration_tapas.py +12 -2
- transformers/models/tapas/modeling_tapas.py +65 -67
- transformers/models/tapas/tokenization_tapas.py +116 -153
- transformers/models/textnet/configuration_textnet.py +4 -7
- transformers/models/textnet/image_processing_textnet.py +22 -25
- transformers/models/textnet/image_processing_textnet_fast.py +8 -9
- transformers/models/textnet/modeling_textnet.py +28 -28
- transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +82 -84
- transformers/models/timesfm/configuration_timesfm.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +22 -25
- transformers/models/timesfm/modular_timesfm.py +21 -24
- transformers/models/timesformer/configuration_timesformer.py +0 -1
- transformers/models/timesformer/modeling_timesformer.py +13 -16
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -8
- transformers/models/timm_backbone/modeling_timm_backbone.py +25 -30
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
- transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +22 -19
- transformers/models/trocr/configuration_trocr.py +11 -8
- transformers/models/trocr/modeling_trocr.py +42 -42
- transformers/models/trocr/processing_trocr.py +5 -25
- transformers/models/tvp/configuration_tvp.py +10 -36
- transformers/models/tvp/image_processing_tvp.py +50 -52
- transformers/models/tvp/image_processing_tvp_fast.py +15 -15
- transformers/models/tvp/modeling_tvp.py +26 -28
- transformers/models/tvp/processing_tvp.py +2 -14
- transformers/models/udop/configuration_udop.py +16 -8
- transformers/models/udop/modeling_udop.py +73 -72
- transformers/models/udop/processing_udop.py +7 -26
- transformers/models/udop/tokenization_udop.py +80 -93
- transformers/models/umt5/configuration_umt5.py +8 -7
- transformers/models/umt5/modeling_umt5.py +87 -84
- transformers/models/unispeech/configuration_unispeech.py +4 -2
- transformers/models/unispeech/modeling_unispeech.py +54 -53
- transformers/models/unispeech/modular_unispeech.py +20 -22
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -2
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +70 -69
- transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
- transformers/models/univnet/feature_extraction_univnet.py +14 -14
- transformers/models/univnet/modeling_univnet.py +7 -8
- transformers/models/upernet/configuration_upernet.py +8 -36
- transformers/models/upernet/modeling_upernet.py +11 -14
- transformers/models/vaultgemma/__init__.py +0 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +29 -33
- transformers/models/vaultgemma/modeling_vaultgemma.py +38 -40
- transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +12 -14
- transformers/models/video_llama_3/modeling_video_llama_3.py +149 -112
- transformers/models/video_llama_3/modular_video_llama_3.py +152 -150
- transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
- transformers/models/video_llama_3/video_processing_video_llama_3.py +45 -24
- transformers/models/video_llava/configuration_video_llava.py +4 -1
- transformers/models/video_llava/image_processing_video_llava.py +35 -38
- transformers/models/video_llava/modeling_video_llava.py +139 -143
- transformers/models/video_llava/processing_video_llava.py +38 -78
- transformers/models/video_llava/video_processing_video_llava.py +0 -1
- transformers/models/videomae/configuration_videomae.py +0 -1
- transformers/models/videomae/image_processing_videomae.py +31 -34
- transformers/models/videomae/modeling_videomae.py +17 -20
- transformers/models/videomae/video_processing_videomae.py +0 -1
- transformers/models/vilt/configuration_vilt.py +4 -2
- transformers/models/vilt/image_processing_vilt.py +29 -30
- transformers/models/vilt/image_processing_vilt_fast.py +15 -16
- transformers/models/vilt/modeling_vilt.py +103 -90
- transformers/models/vilt/processing_vilt.py +2 -14
- transformers/models/vipllava/configuration_vipllava.py +4 -1
- transformers/models/vipllava/modeling_vipllava.py +92 -67
- transformers/models/vipllava/modular_vipllava.py +78 -54
- transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +28 -27
- transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +45 -41
- transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
- transformers/models/visual_bert/configuration_visual_bert.py +6 -2
- transformers/models/visual_bert/modeling_visual_bert.py +90 -92
- transformers/models/vit/configuration_vit.py +2 -3
- transformers/models/vit/image_processing_vit.py +19 -22
- transformers/models/vit/image_processing_vit_fast.py +0 -1
- transformers/models/vit/modeling_vit.py +20 -20
- transformers/models/vit_mae/configuration_vit_mae.py +0 -1
- transformers/models/vit_mae/modeling_vit_mae.py +32 -30
- transformers/models/vit_msn/configuration_vit_msn.py +0 -1
- transformers/models/vit_msn/modeling_vit_msn.py +21 -19
- transformers/models/vitdet/configuration_vitdet.py +2 -5
- transformers/models/vitdet/modeling_vitdet.py +14 -17
- transformers/models/vitmatte/configuration_vitmatte.py +7 -39
- transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +16 -17
- transformers/models/vitmatte/modeling_vitmatte.py +10 -12
- transformers/models/vitpose/configuration_vitpose.py +7 -47
- transformers/models/vitpose/image_processing_vitpose.py +24 -25
- transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
- transformers/models/vitpose/modeling_vitpose.py +15 -15
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -5
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +13 -16
- transformers/models/vits/configuration_vits.py +4 -1
- transformers/models/vits/modeling_vits.py +43 -42
- transformers/models/vits/tokenization_vits.py +3 -4
- transformers/models/vivit/configuration_vivit.py +0 -1
- transformers/models/vivit/image_processing_vivit.py +36 -39
- transformers/models/vivit/modeling_vivit.py +9 -11
- transformers/models/vjepa2/__init__.py +0 -1
- transformers/models/vjepa2/configuration_vjepa2.py +0 -1
- transformers/models/vjepa2/modeling_vjepa2.py +39 -41
- transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
- transformers/models/voxtral/__init__.py +0 -1
- transformers/models/voxtral/configuration_voxtral.py +0 -2
- transformers/models/voxtral/modeling_voxtral.py +41 -48
- transformers/models/voxtral/modular_voxtral.py +35 -38
- transformers/models/voxtral/processing_voxtral.py +25 -48
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -2
- transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
- transformers/models/wav2vec2/modeling_wav2vec2.py +74 -126
- transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
- transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
- transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -2
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
- transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
- transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
- transformers/models/wavlm/configuration_wavlm.py +4 -2
- transformers/models/wavlm/modeling_wavlm.py +49 -49
- transformers/models/wavlm/modular_wavlm.py +4 -5
- transformers/models/whisper/configuration_whisper.py +6 -5
- transformers/models/whisper/english_normalizer.py +3 -4
- transformers/models/whisper/feature_extraction_whisper.py +9 -24
- transformers/models/whisper/generation_whisper.py +26 -49
- transformers/models/whisper/modeling_whisper.py +71 -73
- transformers/models/whisper/processing_whisper.py +3 -20
- transformers/models/whisper/tokenization_whisper.py +9 -30
- transformers/models/x_clip/configuration_x_clip.py +4 -2
- transformers/models/x_clip/modeling_x_clip.py +94 -96
- transformers/models/x_clip/processing_x_clip.py +2 -14
- transformers/models/xcodec/configuration_xcodec.py +4 -6
- transformers/models/xcodec/modeling_xcodec.py +15 -17
- transformers/models/xglm/configuration_xglm.py +9 -8
- transformers/models/xglm/modeling_xglm.py +49 -55
- transformers/models/xglm/tokenization_xglm.py +1 -4
- transformers/models/xlm/configuration_xlm.py +10 -8
- transformers/models/xlm/modeling_xlm.py +127 -131
- transformers/models/xlm/tokenization_xlm.py +3 -5
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -3
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +96 -98
- transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -2
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +97 -99
- transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
- transformers/models/xlnet/configuration_xlnet.py +3 -12
- transformers/models/xlnet/modeling_xlnet.py +149 -162
- transformers/models/xlnet/tokenization_xlnet.py +1 -4
- transformers/models/xlstm/configuration_xlstm.py +8 -12
- transformers/models/xlstm/modeling_xlstm.py +61 -96
- transformers/models/xmod/configuration_xmod.py +11 -3
- transformers/models/xmod/modeling_xmod.py +111 -116
- transformers/models/yolos/configuration_yolos.py +0 -1
- transformers/models/yolos/image_processing_yolos.py +60 -62
- transformers/models/yolos/image_processing_yolos_fast.py +42 -45
- transformers/models/yolos/modeling_yolos.py +19 -21
- transformers/models/yolos/modular_yolos.py +17 -19
- transformers/models/yoso/configuration_yoso.py +8 -2
- transformers/models/yoso/modeling_yoso.py +60 -62
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -8
- transformers/models/zamba/modeling_zamba.py +93 -125
- transformers/models/zamba2/configuration_zamba2.py +44 -50
- transformers/models/zamba2/modeling_zamba2.py +137 -165
- transformers/models/zamba2/modular_zamba2.py +79 -74
- transformers/models/zoedepth/configuration_zoedepth.py +17 -41
- transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +20 -21
- transformers/models/zoedepth/modeling_zoedepth.py +19 -19
- transformers/pipelines/__init__.py +47 -106
- transformers/pipelines/any_to_any.py +15 -23
- transformers/pipelines/audio_utils.py +1 -2
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +13 -17
- transformers/pipelines/image_text_to_text.py +1 -2
- transformers/pipelines/question_answering.py +4 -43
- transformers/pipelines/text_classification.py +1 -14
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/pipelines/token_classification.py +1 -22
- transformers/pipelines/video_classification.py +1 -9
- transformers/pipelines/zero_shot_audio_classification.py +0 -1
- transformers/pipelines/zero_shot_classification.py +0 -6
- transformers/pipelines/zero_shot_image_classification.py +0 -7
- transformers/processing_utils.py +128 -137
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/base.py +10 -0
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_quark.py +0 -1
- transformers/quantizers/quantizer_torchao.py +3 -19
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +6 -65
- transformers/tokenization_mistral_common.py +563 -903
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +228 -341
- transformers/tokenization_utils_sentencepiece.py +5 -6
- transformers/tokenization_utils_tokenizers.py +36 -7
- transformers/trainer.py +30 -41
- transformers/trainer_jit_checkpoint.py +1 -2
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +414 -420
- transformers/utils/__init__.py +1 -4
- transformers/utils/attention_visualizer.py +1 -1
- transformers/utils/auto_docstring.py +567 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/dummy_pt_objects.py +0 -42
- transformers/utils/generic.py +70 -34
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +135 -107
- transformers/utils/quantization_config.py +8 -31
- transformers/video_processing_utils.py +24 -25
- transformers/video_utils.py +21 -23
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/METADATA +120 -239
- transformers-5.1.0.dist-info/RECORD +2092 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -229
- transformers-5.0.0rc2.dist-info/RECORD +0 -2042
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
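The hunk below adds the modular definition for the new GLM-Image model (configuration, vision, text, and VQ-VAE components built on top of Glm4v, Chameleon, Qwen2-VL, and SigLIP code). For orientation, here is a minimal, hypothetical sketch of how the configuration classes introduced in that hunk compose; the import path is an assumption inferred from the modular file and is not shown in this diff.

```python
# Hypothetical usage sketch for the GLM-Image configuration classes added below.
# Only the class definitions appear in this diff; the import path is assumed.
from transformers.models.glm_image.configuration_glm_image import (
    GlmImageConfig,
    GlmImageTextConfig,
    GlmImageVisionConfig,
    GlmImageVQVAEConfig,
)

# Sub-configs may be passed as instances or plain dicts; dicts are converted
# through the `sub_configs` mapping in GlmImageConfig.__init__ (see the hunk).
config = GlmImageConfig(
    text_config=GlmImageTextConfig(),      # defaults: vocab_size=168064, pad_token_id=167841
    vision_config=GlmImageVisionConfig(),  # defaults: depth=40, hidden_size=1536, patch_size=16
    vq_config={"embed_dim": 2048, "num_embeddings": 16384},
    image_token_id=167855,
)

print(config.model_type)                # "glm_image"
print(config.vq_config.num_embeddings)  # 16384
```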
|
@@ -0,0 +1,1640 @@
|
|
|
1
|
+
# Copyright 2025 the HuggingFace Team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import torch.nn as nn
|
|
21
|
+
import torch.nn.functional as F
|
|
22
|
+
|
|
23
|
+
from ...cache_utils import Cache
|
|
24
|
+
from ...configuration_utils import PreTrainedConfig
|
|
25
|
+
from ...feature_extraction_utils import BatchFeature
|
|
26
|
+
from ...generation import GenerationMixin
|
|
27
|
+
from ...image_utils import ImageInput
|
|
28
|
+
from ...modeling_outputs import BaseModelOutputWithPooling
|
|
29
|
+
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
|
30
|
+
from ...processing_utils import ImagesKwargs, ProcessorMixin, Unpack
|
|
31
|
+
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
|
32
|
+
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
|
|
33
|
+
from ...utils.generic import check_model_inputs
|
|
34
|
+
from ..chameleon.modeling_chameleon import ChameleonVQVAE, ChameleonVQVAEModelOutput, ChameleonVQVAEVectorQuantizer
|
|
35
|
+
from ..glm4v.configuration_glm4v import Glm4vTextConfig, Glm4vVisionConfig
|
|
36
|
+
from ..glm4v.modeling_glm4v import (
|
|
37
|
+
Glm4vCausalLMOutputWithPast,
|
|
38
|
+
Glm4vModel,
|
|
39
|
+
Glm4vModelOutputWithPast,
|
|
40
|
+
Glm4vPreTrainedModel,
|
|
41
|
+
Glm4vTextModel,
|
|
42
|
+
Glm4vVisionAttention,
|
|
43
|
+
Glm4vVisionBlock,
|
|
44
|
+
Glm4vVisionEmbeddings,
|
|
45
|
+
Glm4vVisionModel,
|
|
46
|
+
Glm4vVisionPatchEmbed,
|
|
47
|
+
)
|
|
48
|
+
from ..glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextAttention, eager_attention_forward
|
|
49
|
+
from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
|
|
50
|
+
from ..qwen2_vl.image_processing_qwen2_vl_fast import Qwen2VLImageProcessorFast
|
|
51
|
+
from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessorKwargs
|
|
52
|
+
from ..siglip.modeling_siglip import SiglipMLP
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
if is_torch_available():
|
|
56
|
+
import torch
|
|
57
|
+
|
|
58
|
+
logger = logging.get_logger(__name__)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class GlmImageVQVAEConfig(PreTrainedConfig):
|
|
62
|
+
r"""
|
|
63
|
+
This is the configuration class to store the configuration of a [`GlmImageVQModel`]. It is used to instantiate a
|
|
64
|
+
`GlmImageVQModel` according to the specified arguments, defining the model architecture.
|
|
65
|
+
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
|
66
|
+
documentation from [`PreTrainedConfig`] for more information. Instantiating a
|
|
67
|
+
configuration with the defaults will yield a similar configuration to the VQModel of the
|
|
68
|
+
[zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
embed_dim (`int`, *optional*, defaults to 2048):
|
|
72
|
+
Dimensionality of each embedding vector.
|
|
73
|
+
num_embeddings (`int`, *optional*, defaults to 16384):
|
|
74
|
+
Number of codebook embeddings.
|
|
75
|
+
latent_channels (`int`, *optional*, defaults to 1536):
|
|
76
|
+
Number of channels for the latent space.
|
|
77
|
+
in_channels (`int`, *optional*, defaults to 3):
|
|
78
|
+
Number of input channels.
|
|
79
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
80
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
81
|
+
"""
|
|
82
|
+
|
|
83
|
+
model_type = "glm_image_vqmodel"
|
|
84
|
+
base_config_key = "vq_config"
|
|
85
|
+
|
|
86
|
+
def __init__(
|
|
87
|
+
self,
|
|
88
|
+
embed_dim: int = 2048,
|
|
89
|
+
num_embeddings: int = 16384,
|
|
90
|
+
latent_channels: int = 1536,
|
|
91
|
+
in_channels: int = 3,
|
|
92
|
+
initializer_range=0.02,
|
|
93
|
+
**kwargs,
|
|
94
|
+
):
|
|
95
|
+
super().__init__(**kwargs)
|
|
96
|
+
self.embed_dim = embed_dim
|
|
97
|
+
self.num_embeddings = num_embeddings
|
|
98
|
+
self.latent_channels = latent_channels
|
|
99
|
+
self.in_channels = in_channels
|
|
100
|
+
self.initializer_range = initializer_range
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class GlmImageVisionConfig(Glm4vVisionConfig):
|
|
104
|
+
r"""
|
|
105
|
+
This is the configuration class to store the configuration of a [`GlmImageVisionModel`]. It is used to instantiate an GlmImageVisionModel
|
|
106
|
+
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
|
|
107
|
+
a similar configuration to that of
|
|
108
|
+
GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
depth (`int`, *optional*, defaults to 40):
|
|
112
|
+
Number of layers (depth) in the model.
|
|
113
|
+
hidden_size (`int`, *optional*, defaults to 1536):
|
|
114
|
+
Dimensionality of the encoder layers and the pooler layer.
|
|
115
|
+
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
|
116
|
+
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
|
117
|
+
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
|
118
|
+
attention_bias (`bool`, *optional*, defaults to `True`):
|
|
119
|
+
Whether to add a bias to the queries, keys and values.
|
|
120
|
+
attention_dropout (`float`, *optional*, defaults to 0.0):
|
|
121
|
+
Dropout probability for attention weights.
|
|
122
|
+
num_heads (`int`, *optional*, defaults to 16):
|
|
123
|
+
Number of attention heads for each attention layer in the Transformer architecture.
|
|
124
|
+
in_channels (`int`, *optional*, defaults to 3):
|
|
125
|
+
Number of input channels.
|
|
126
|
+
image_size (`int` or `list[int]`, *optional*, defaults to 2048):
|
|
127
|
+
The size (resolution) of each image.
|
|
128
|
+
patch_size (`int`, *optional*, defaults to 16):
|
|
129
|
+
The size (resolution) of each patch.
|
|
130
|
+
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
131
|
+
The epsilon used by the layer normalization layers.
|
|
132
|
+
spatial_merge_size (`int`, *optional*, defaults to 1):
|
|
133
|
+
The size used for merging spatial dimensions.
|
|
134
|
+
intermediate_size (`int`, *optional*, defaults to 6144):
|
|
135
|
+
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
|
136
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
137
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
138
|
+
"""
|
|
139
|
+
|
|
140
|
+
model_type = "glm_image_vision"
|
|
141
|
+
base_config_key = "vision_config"
|
|
142
|
+
|
|
143
|
+
def __init__(
|
|
144
|
+
self,
|
|
145
|
+
depth=40,
|
|
146
|
+
hidden_size=1536,
|
|
147
|
+
hidden_act="gelu",
|
|
148
|
+
attention_bias=True,
|
|
149
|
+
attention_dropout=0.0,
|
|
150
|
+
num_heads=16,
|
|
151
|
+
in_channels=3,
|
|
152
|
+
image_size=2048,
|
|
153
|
+
patch_size=16,
|
|
154
|
+
layer_norm_eps=1e-06,
|
|
155
|
+
spatial_merge_size=1,
|
|
156
|
+
intermediate_size=6144,
|
|
157
|
+
initializer_range=0.02,
|
|
158
|
+
**kwargs,
|
|
159
|
+
):
|
|
160
|
+
super().__init__(**kwargs)
|
|
161
|
+
del self.out_hidden_size
|
|
162
|
+
del self.rms_norm_eps
|
|
163
|
+
del self.temporal_patch_size
|
|
164
|
+
self.layer_norm_eps = layer_norm_eps
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class GlmImageTextConfig(Glm4vTextConfig):
|
|
168
|
+
r"""
|
|
169
|
+
This is the configuration class to store the configuration of a [`GlmImageTextModel`]. It is used to instantiate a
|
|
170
|
+
GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
|
|
171
|
+
configuration with the defaults will yield a similar configuration to that of
|
|
172
|
+
GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
|
|
173
|
+
|
|
174
|
+
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
|
175
|
+
documentation from [`PreTrainedConfig`] for more information.
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
vocab_size (`int`, *optional*, defaults to 168064):
|
|
179
|
+
Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
|
|
180
|
+
the `inputs_ids` passed when calling [`GlmImageModel`]
|
|
181
|
+
hidden_size (`int`, *optional*, defaults to 4096):
|
|
182
|
+
Dimension of the hidden representations.
|
|
183
|
+
intermediate_size (`int`, *optional*, defaults to 13696):
|
|
184
|
+
Dimension of the MLP representations.
|
|
185
|
+
num_hidden_layers (`int`, *optional*, defaults to 40):
|
|
186
|
+
Number of hidden layers in the Transformer encoder.
|
|
187
|
+
num_attention_heads (`int`, *optional*, defaults to 32):
|
|
188
|
+
Number of attention heads for each attention layer in the Transformer encoder.
|
|
189
|
+
num_key_value_heads (`int`, *optional*, defaults to 2):
|
|
190
|
+
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
|
191
|
+
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
|
192
|
+
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
|
193
|
+
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
|
194
|
+
by meanpooling all the original heads within that group. For more details checkout [this
|
|
195
|
+
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
|
|
196
|
+
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
|
197
|
+
The non-linear activation function (function or string) in the decoder.
|
|
198
|
+
max_position_embeddings (`int`, *optional*, defaults to 131072):
|
|
199
|
+
The maximum sequence length that this model might ever be used with.
|
|
200
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
201
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
202
|
+
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
|
203
|
+
The epsilon used by the rms normalization layers.
|
|
204
|
+
use_cache (`bool`, *optional*, defaults to `True`):
|
|
205
|
+
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
|
206
|
+
relevant if `config.is_decoder=True`.
|
|
207
|
+
attention_dropout (`float`, *optional*, defaults to 0.0):
|
|
208
|
+
The dropout ratio for the attention probabilities.
|
|
209
|
+
rope_parameters (`RopeParameters`, *optional*):
|
|
210
|
+
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
|
|
211
|
+
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
|
|
212
|
+
with longer `max_position_embeddings`.
|
|
213
|
+
pad_token_id (`int`, *optional*, defaults to 167841):
|
|
214
|
+
The id of the padding token.
|
|
215
|
+
vision_vocab_size (`int`, *optional*, defaults to 16512):
|
|
216
|
+
Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
|
|
217
|
+
represented by the `inputs_ids` passed when calling [`GlmImageVisionModel`]
|
|
218
|
+
attention_bias (`bool`, *optional*, defaults to `True`):
|
|
219
|
+
Whether to add a bias to the queries, keys and values.
|
|
220
|
+
eos_token_id (`int`, *optional*, defaults to 16385):
|
|
221
|
+
The id of the end of sequence token.
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
>>> from transformers import GlmImageTextModel, GlmImageConfig
|
|
225
|
+
|
|
226
|
+
>>> # Initializing a GlmImageConfig style configuration
|
|
227
|
+
>>> configuration = GlmImageConfig()
|
|
228
|
+
|
|
229
|
+
>>> # Initializing a model from the GlmImageConfig style configuration
|
|
230
|
+
>>> model = GlmImageTextModel(configuration)
|
|
231
|
+
|
|
232
|
+
>>> # Accessing the model configuration
|
|
233
|
+
>>> configuration = model.config
|
|
234
|
+
```"""
|
|
235
|
+
|
|
236
|
+
def __init__(
|
|
237
|
+
self,
|
|
238
|
+
vocab_size: int = 168064,
|
|
239
|
+
max_position_embeddings: int = 131072,
|
|
240
|
+
vision_vocab_size: int = 16512,
|
|
241
|
+
attention_bias: bool = True,
|
|
242
|
+
pad_token_id: int = 167841,
|
|
243
|
+
eos_token_id: int = 16385,
|
|
244
|
+
**super_kwargs,
|
|
245
|
+
):
|
|
246
|
+
super().__init__(
|
|
247
|
+
vocab_size=vocab_size,
|
|
248
|
+
max_position_embeddings=max_position_embeddings,
|
|
249
|
+
pad_token_id=pad_token_id,
|
|
250
|
+
**super_kwargs,
|
|
251
|
+
)
|
|
252
|
+
self.vision_vocab_size = vision_vocab_size
|
|
253
|
+
self.attention_bias = attention_bias
|
|
254
|
+
self.eos_token_id = eos_token_id
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class GlmImageConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`GlmImageModel`]. It is used to instantiate a
    GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the
    GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageVisionConfig`):
            The config object or dictionary of the vision backbone.
        vq_config (`Union[Dict, GlmImageVQVAEConfig]`, *optional*):
            GlmImageVQVAEConfig instance containing the configuration for the VQ-VAE model.
        image_token_id (`int`, *optional*, defaults to 167855):
            The image token index to encode the image prompt.
        image_start_token_id (`int`, *optional*, defaults to 16384):
            The image start token index to encode the start of an image.
        image_end_token_id (`int`, *optional*, defaults to 16385):
            The image end token index to encode the end of an image.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    ```python
    >>> from transformers import GlmImageForConditionalGeneration, GlmImageConfig

    >>> # Initializing a GLM-Image style configuration
    >>> configuration = GlmImageConfig()

    >>> # Initializing a model from the GLM-Image style configuration
    >>> model = GlmImageForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

model_type = "glm_image"
|
|
297
|
+
sub_configs = {
|
|
298
|
+
"vision_config": GlmImageVisionConfig,
|
|
299
|
+
"text_config": GlmImageTextConfig,
|
|
300
|
+
"vq_config": GlmImageVQVAEConfig,
|
|
301
|
+
}
|
|
302
|
+
keys_to_ignore_at_inference = ["past_key_values"]
|
|
303
|
+
|
|
304
|
+
def __init__(
|
|
305
|
+
self,
|
|
306
|
+
text_config=None,
|
|
307
|
+
vision_config=None,
|
|
308
|
+
vq_config=None,
|
|
309
|
+
image_token_id=167855,
|
|
310
|
+
image_start_token_id=16384,
|
|
311
|
+
image_end_token_id=16385,
|
|
312
|
+
tie_word_embeddings: bool | None = False,
|
|
313
|
+
**kwargs,
|
|
314
|
+
):
|
|
315
|
+
if isinstance(vision_config, dict):
|
|
316
|
+
vision_config = self.sub_configs["vision_config"](**vision_config)
|
|
317
|
+
elif vision_config is None:
|
|
318
|
+
vision_config = self.sub_configs["vision_config"](**kwargs)
|
|
319
|
+
|
|
320
|
+
if isinstance(vq_config, dict):
|
|
321
|
+
vq_config = self.sub_configs["vq_config"](**vq_config)
|
|
322
|
+
elif vq_config is None:
|
|
323
|
+
vq_config = self.sub_configs["vq_config"](**kwargs)
|
|
324
|
+
|
|
325
|
+
if isinstance(text_config, dict):
|
|
326
|
+
text_config = self.sub_configs["text_config"](**text_config)
|
|
327
|
+
elif text_config is None:
|
|
328
|
+
text_config = self.sub_configs["text_config"](**kwargs)
|
|
329
|
+
|
|
330
|
+
self.image_token_id = image_token_id
|
|
331
|
+
self.image_start_token_id = image_start_token_id
|
|
332
|
+
self.image_end_token_id = image_end_token_id
|
|
333
|
+
self.text_config = text_config
|
|
334
|
+
self.vision_config = vision_config
|
|
335
|
+
self.vq_config = vq_config
|
|
336
|
+
self.tie_word_embeddings = tie_word_embeddings
|
|
337
|
+
super().__init__(**kwargs)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
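The `__init__` above accepts each sub-config as either a ready-made config object, a plain `dict`, or `None` (in which case defaults are built from `**kwargs`). A minimal sketch of how that promotion behaves, assuming `GlmImageConfig` is exposed at the top-level `transformers` namespace (an assumption based on the docstring example, not verified here):

```python
# Sketch only: dicts and None are promoted to the registered sub-config classes.
# The import path is an assumption taken from the docstring example above.
from transformers import GlmImageConfig

config = GlmImageConfig(vision_config={}, text_config=None, vq_config=None)
print(type(config.vision_config).__name__)  # expected: GlmImageVisionConfig
print(type(config.text_config).__name__)    # expected: GlmImageTextConfig (defaults)
```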
class GlmImageVisionMLP(SiglipMLP):
|
|
341
|
+
pass
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class GlmImageVisionAttention(Glm4vVisionAttention):
|
|
345
|
+
def __init__(self, config: GlmImageVisionConfig) -> None:
|
|
346
|
+
super().__init__(config)
|
|
347
|
+
self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
|
|
348
|
+
self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
|
|
349
|
+
|
|
350
|
+
def forward(
|
|
351
|
+
self,
|
|
352
|
+
hidden_states: torch.Tensor,
|
|
353
|
+
cu_seqlens: torch.Tensor,
|
|
354
|
+
**kwargs,
|
|
355
|
+
) -> torch.Tensor:
|
|
356
|
+
seq_length = hidden_states.shape[0]
|
|
357
|
+
query_states, key_states, value_states = (
|
|
358
|
+
self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
|
|
359
|
+
)
|
|
360
|
+
query_states = query_states.transpose(0, 1).unsqueeze(0)
|
|
361
|
+
key_states = key_states.transpose(0, 1).unsqueeze(0)
|
|
362
|
+
value_states = value_states.transpose(0, 1).unsqueeze(0)
|
|
363
|
+
|
|
364
|
+
attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
|
|
365
|
+
self.config._attn_implementation, eager_attention_forward
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
if "flash" in self.config._attn_implementation:
|
|
369
|
+
# Flash Attention: Use cu_seqlens for variable length attention
|
|
370
|
+
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
|
|
371
|
+
attn_output, _ = attention_interface(
|
|
372
|
+
self,
|
|
373
|
+
query_states,
|
|
374
|
+
key_states,
|
|
375
|
+
value_states,
|
|
376
|
+
attention_mask=None,
|
|
377
|
+
scaling=self.scaling,
|
|
378
|
+
dropout=0.0 if not self.training else self.attention_dropout,
|
|
379
|
+
cu_seq_lens_q=cu_seqlens,
|
|
380
|
+
cu_seq_lens_k=cu_seqlens,
|
|
381
|
+
max_length_q=max_seqlen,
|
|
382
|
+
max_length_k=max_seqlen,
|
|
383
|
+
is_causal=False,
|
|
384
|
+
**kwargs,
|
|
385
|
+
)
|
|
386
|
+
else:
|
|
387
|
+
# Other implementations: Process each chunk separately
|
|
388
|
+
lengths = cu_seqlens[1:] - cu_seqlens[:-1]
|
|
389
|
+
splits = [
|
|
390
|
+
torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
|
|
391
|
+
]
|
|
392
|
+
|
|
393
|
+
attn_outputs = [
|
|
394
|
+
attention_interface(
|
|
395
|
+
self,
|
|
396
|
+
q,
|
|
397
|
+
k,
|
|
398
|
+
v,
|
|
399
|
+
attention_mask=None,
|
|
400
|
+
scaling=self.scaling,
|
|
401
|
+
dropout=0.0 if not self.training else self.attention_dropout,
|
|
402
|
+
is_causal=False,
|
|
403
|
+
**kwargs,
|
|
404
|
+
)[0]
|
|
405
|
+
for q, k, v in zip(*splits)
|
|
406
|
+
]
|
|
407
|
+
attn_output = torch.cat(attn_outputs, dim=1)
|
|
408
|
+
|
|
409
|
+
attn_output = attn_output.reshape(seq_length, -1).contiguous()
|
|
410
|
+
attn_output = self.proj(attn_output)
|
|
411
|
+
return attn_output
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
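In the non-flash branch above, the packed patch sequence is chunked per image using the differences of `cu_seqlens`, and each chunk attends only within itself. A toy sketch of that splitting step (the module splits along `dim=2` of `[1, heads, seq, head_dim]` tensors; plain 2-D tensors are used here for brevity):

```python
# Toy illustration of chunking a packed sequence with cu_seqlens, as in the
# non-flash branch above (tensors and sizes are made up).
import torch

hidden = torch.randn(10, 8)            # 10 packed patches, feature dim 8
cu_seqlens = torch.tensor([0, 4, 10])  # image 0 -> 4 patches, image 1 -> 6 patches
lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
chunks = torch.split(hidden, lengths, dim=0)
assert [c.shape[0] for c in chunks] == [4, 6]
```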
class GlmImageVisionPatchEmbed(Glm4vVisionPatchEmbed):
|
|
415
|
+
def __init__(self, config: GlmImageVisionConfig) -> None:
|
|
416
|
+
super().__init__(config)
|
|
417
|
+
|
|
418
|
+
del self.temporal_patch_size
|
|
419
|
+
kernel_size = [self.patch_size, self.patch_size]
|
|
420
|
+
self.proj = nn.Conv2d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size)
|
|
421
|
+
|
|
422
|
+
def forward(self, hidden_states):
|
|
423
|
+
target_dtype = self.proj.weight.dtype
|
|
424
|
+
hidden_states = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size)
|
|
425
|
+
hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
|
|
426
|
+
return hidden_states
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
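As a shape check for the patch embedding above: every packed row of `in_channels * patch_size * patch_size` pixel values is reshaped into a tiny image and mapped to one `embed_dim` vector by the strided convolution. A toy sketch with made-up sizes (the real values come from `GlmImageVisionConfig`):

```python
# Toy shape walk-through of the Conv2d patch projection above.
import torch
from torch import nn

in_channels, patch_size, embed_dim, num_patches = 3, 14, 32, 5
proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
pixels = torch.randn(num_patches, in_channels * patch_size * patch_size)
out = proj(pixels.view(-1, in_channels, patch_size, patch_size)).view(-1, embed_dim)
assert out.shape == (num_patches, embed_dim)
```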
class GlmImageVisionEmbeddings(Glm4vVisionEmbeddings):
|
|
430
|
+
def __init__(self, config: GlmImageVisionConfig) -> None:
|
|
431
|
+
super().__init__(config)
|
|
432
|
+
self.interpolated_method = "bilinear"
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class GlmImageVisionBlock(Glm4vVisionBlock):
|
|
436
|
+
def __init__(self, config: GlmImageVisionConfig):
|
|
437
|
+
super().__init__(config)
|
|
438
|
+
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
439
|
+
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
440
|
+
self.attn = GlmImageVisionAttention(config)
|
|
441
|
+
self.mlp = GlmImageVisionMLP(config)
|
|
442
|
+
|
|
443
|
+
def forward(
|
|
444
|
+
self,
|
|
445
|
+
hidden_states: torch.Tensor,
|
|
446
|
+
cu_seqlens: torch.Tensor,
|
|
447
|
+
**kwargs: Unpack[TransformersKwargs],
|
|
448
|
+
) -> torch.Tensor:
|
|
449
|
+
r"""
|
|
450
|
+
cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`):
|
|
451
|
+
The cumulative sequence lengths of each image or video feature.
|
|
452
|
+
position_embeddings (`tuple(torch.Tensor, torch.Tensor)` of shape `(num_patches, head_dim // 2)`):
|
|
453
|
+
The cosine and sine position embeddings for vision attention.
|
|
454
|
+
"""
|
|
455
|
+
residual = hidden_states
|
|
456
|
+
|
|
457
|
+
hidden_states = self.norm1(hidden_states)
|
|
458
|
+
hidden_states = self.attn(
|
|
459
|
+
hidden_states,
|
|
460
|
+
cu_seqlens=cu_seqlens,
|
|
461
|
+
**kwargs,
|
|
462
|
+
)
|
|
463
|
+
hidden_states = residual + hidden_states
|
|
464
|
+
|
|
465
|
+
residual = hidden_states
|
|
466
|
+
hidden_states = self.norm2(hidden_states)
|
|
467
|
+
hidden_states = self.mlp(hidden_states)
|
|
468
|
+
hidden_states = residual + hidden_states
|
|
469
|
+
|
|
470
|
+
return hidden_states
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
class GlmImageTextAttention(Glm4vMoeTextAttention):
|
|
474
|
+
pass
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
class GlmImagePreTrainedModel(Glm4vPreTrainedModel):
|
|
478
|
+
config: GlmImageConfig
|
|
479
|
+
input_modalities = ("image", "text")
|
|
480
|
+
|
|
481
|
+
@torch.no_grad()
|
|
482
|
+
    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
class GlmImageModelOutputWithPast(Glm4vModelOutputWithPast):
|
|
487
|
+
pass
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
class GlmImageVQVAEVectorQuantizer(ChameleonVQVAEVectorQuantizer):
|
|
491
|
+
def __init__(self, config: GlmImageVQVAEConfig):
|
|
492
|
+
super().__init__(config)
|
|
493
|
+
self.num_embeddings = config.num_embeddings
|
|
494
|
+
self.embedding_dim = config.embed_dim
|
|
495
|
+
self.beta = getattr(config, "beta", 0.25)
|
|
496
|
+
|
|
497
|
+
self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
|
|
498
|
+
|
|
499
|
+
def forward(self, hidden_state: torch.Tensor):
|
|
500
|
+
hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
|
|
501
|
+
hidden_state_flattened = hidden_state.view(-1, self.embedding_dim)
|
|
502
|
+
|
|
503
|
+
# L2 normalize
|
|
504
|
+
hidden_state = F.normalize(hidden_state, p=2, dim=-1)
|
|
505
|
+
hidden_state_flattened = F.normalize(hidden_state_flattened, p=2, dim=-1)
|
|
506
|
+
embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
|
|
507
|
+
|
|
508
|
+
# distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
|
|
509
|
+
distances = (
|
|
510
|
+
torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
|
|
511
|
+
+ torch.sum(embedding**2, dim=1)
|
|
512
|
+
- 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, embedding.transpose(0, 1))
|
|
513
|
+
)
|
|
514
|
+
|
|
515
|
+
min_encoding_indices = torch.argmin(distances, dim=1)
|
|
516
|
+
hidden_state_quant = embedding[min_encoding_indices].view(hidden_state.shape)
|
|
517
|
+
|
|
518
|
+
# compute loss for embedding
|
|
519
|
+
loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean(
|
|
520
|
+
(hidden_state_quant - hidden_state.detach()) ** 2
|
|
521
|
+
)
|
|
522
|
+
|
|
523
|
+
# preserve gradients
|
|
524
|
+
hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach()
|
|
525
|
+
|
|
526
|
+
# reshape back to match original input shape
|
|
527
|
+
hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous()
|
|
528
|
+
|
|
529
|
+
return hidden_state_quant, loss, min_encoding_indices
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
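The quantizer above L2-normalizes both the features and the codebook, picks the nearest code by squared Euclidean distance, and passes gradients through with a straight-through estimator. A self-contained toy version of just the lookup step:

```python
# Toy nearest-codebook lookup mirroring the distance computation above
# (random tensors; no training, no straight-through step).
import torch
import torch.nn.functional as F

feats = F.normalize(torch.randn(5, 16), p=2, dim=-1)      # 5 flattened positions
codebook = F.normalize(torch.randn(32, 16), p=2, dim=-1)  # 32 codebook entries
distances = feats.pow(2).sum(1, keepdim=True) + codebook.pow(2).sum(1) - 2 * feats @ codebook.T
indices = distances.argmin(dim=1)
assert indices.shape == (5,)
```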
class GlmImageVQVAEModelOutput(ChameleonVQVAEModelOutput):
|
|
533
|
+
pass
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
class GlmImageVQVAE(ChameleonVQVAE):
|
|
537
|
+
_no_split_modules = [
|
|
538
|
+
"GlmImageVQVAEVectorQuantizer",
|
|
539
|
+
]
|
|
540
|
+
_can_record_outputs = {}
|
|
541
|
+
|
|
542
|
+
def __init__(self, config: GlmImageVQVAEConfig):
|
|
543
|
+
super().__init__(config)
|
|
544
|
+
del self.encoder
|
|
545
|
+
|
|
546
|
+
def encode(self, hidden_states):
|
|
547
|
+
conv_hidden_states = self.quant_conv(hidden_states)
|
|
548
|
+
quantized_last_hidden_state, emb_loss, indices = self.quantize(conv_hidden_states)
|
|
549
|
+
return GlmImageVQVAEModelOutput(
|
|
550
|
+
last_hidden_state=hidden_states,
|
|
551
|
+
quantized_last_hidden_state=quantized_last_hidden_state,
|
|
552
|
+
image_tokens=indices,
|
|
553
|
+
embedding_loss=emb_loss,
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
class GlmImageVisionModel(Glm4vVisionModel):
|
|
558
|
+
config: GlmImageVisionConfig
|
|
559
|
+
main_input_name = "pixel_values"
|
|
560
|
+
input_modalities = ("image",)
|
|
561
|
+
|
|
562
|
+
def __init__(self, config: GlmImageVisionConfig):
|
|
563
|
+
super().__init__(config)
|
|
564
|
+
|
|
565
|
+
head_dim = config.hidden_size // config.num_heads
|
|
566
|
+
self.head_dim = head_dim
|
|
567
|
+
|
|
568
|
+
del self.merger
|
|
569
|
+
del self.rotary_pos_emb
|
|
570
|
+
del self.post_conv_layernorm
|
|
571
|
+
del self.downsample
|
|
572
|
+
del self.post_layernorm
|
|
573
|
+
|
|
574
|
+
def rot_pos_emb(self, grid_thw):
|
|
575
|
+
pos_ids = []
|
|
576
|
+
for t, h, w in grid_thw:
|
|
577
|
+
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
|
|
578
|
+
hpos_ids = hpos_ids.reshape(
|
|
579
|
+
h // self.spatial_merge_size,
|
|
580
|
+
self.spatial_merge_size,
|
|
581
|
+
w // self.spatial_merge_size,
|
|
582
|
+
self.spatial_merge_size,
|
|
583
|
+
)
|
|
584
|
+
hpos_ids = hpos_ids.permute(0, 2, 1, 3)
|
|
585
|
+
hpos_ids = hpos_ids.flatten()
|
|
586
|
+
|
|
587
|
+
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
|
|
588
|
+
wpos_ids = wpos_ids.reshape(
|
|
589
|
+
h // self.spatial_merge_size,
|
|
590
|
+
self.spatial_merge_size,
|
|
591
|
+
w // self.spatial_merge_size,
|
|
592
|
+
self.spatial_merge_size,
|
|
593
|
+
)
|
|
594
|
+
wpos_ids = wpos_ids.permute(0, 2, 1, 3)
|
|
595
|
+
wpos_ids = wpos_ids.flatten()
|
|
596
|
+
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
|
|
597
|
+
pos_ids = torch.cat(pos_ids, dim=0)
|
|
598
|
+
return pos_ids
|
|
599
|
+
|
|
600
|
+
@check_model_inputs
|
|
601
|
+
@auto_docstring
|
|
602
|
+
def forward(
|
|
603
|
+
self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
|
|
604
|
+
) -> tuple | BaseModelOutputWithPooling:
|
|
605
|
+
r"""
|
|
606
|
+
pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
|
|
607
|
+
Packed pixel values.
|
|
608
|
+
grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
|
|
609
|
+
The temporal, height and width of feature shape of each image.
|
|
610
|
+
|
|
611
|
+
Returns:
|
|
612
|
+
`torch.Tensor` of shape `(total_patches, hidden_size)`: Hidden states.
|
|
613
|
+
"""
|
|
614
|
+
|
|
615
|
+
hidden_states = self.patch_embed(pixel_values)
|
|
616
|
+
image_type_ids = self.rot_pos_emb(grid_thw)
|
|
617
|
+
|
|
618
|
+
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
|
|
619
|
+
dim=0,
|
|
620
|
+
dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
|
|
621
|
+
)
|
|
622
|
+
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
|
623
|
+
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
|
624
|
+
hidden_states = self.embeddings(
|
|
625
|
+
hidden_states,
|
|
626
|
+
seqlens,
|
|
627
|
+
grid_thw,
|
|
628
|
+
image_type_ids[:, 0].to(hidden_states.device),
|
|
629
|
+
image_type_ids[:, 1].to(hidden_states.device),
|
|
630
|
+
)
|
|
631
|
+
|
|
632
|
+
# Transformer blocks (no position_embeddings needed, already added above)
|
|
633
|
+
for blk in self.blocks:
|
|
634
|
+
hidden_states = blk(
|
|
635
|
+
hidden_states,
|
|
636
|
+
cu_seqlens=cu_seqlens,
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
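The vision forward above builds `cu_seqlens` by repeating each image's `h * w` patch count `t` times, taking a cumulative sum, and left-padding with zero, so that block boundaries are available to the attention layers. A small worked example:

```python
# Worked example of the cu_seqlens construction used in the vision forward above.
import torch
import torch.nn.functional as F

grid_thw = torch.tensor([[1, 4, 6], [1, 2, 3]])  # two images: 4x6 and 2x3 patches
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(dim=0)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
assert cu_seqlens.tolist() == [0, 24, 30]
```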
class GlmImageTextModel(Glm4vTextModel):
|
|
643
|
+
pass
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
class GlmImageModel(Glm4vModel):
|
|
647
|
+
def __init__(self, config):
|
|
648
|
+
super().__init__(config)
|
|
649
|
+
self.visual = GlmImageVisionModel._from_config(config.vision_config)
|
|
650
|
+
self.language_model = GlmImageTextModel._from_config(config.text_config)
|
|
651
|
+
self.vqmodel = GlmImageVQVAE._from_config(config.vq_config)
|
|
652
|
+
|
|
653
|
+
self.rope_deltas = None # cache rope_deltas here
|
|
654
|
+
|
|
655
|
+
# Per-sample caches for batch processing
|
|
656
|
+
self._cached_decode_position_ids = None # shape: [batch_size, 3, max_decode_len]
|
|
657
|
+
self._prefill_len = None # prefill sequence length (same for all samples in batch)
|
|
658
|
+
|
|
659
|
+
# Initialize weights and apply final processing
|
|
660
|
+
self.post_init()
|
|
661
|
+
|
|
662
|
+
def get_rope_index(
|
|
663
|
+
self,
|
|
664
|
+
input_ids: torch.LongTensor | None = None,
|
|
665
|
+
image_grid_thw: torch.LongTensor | None = None,
|
|
666
|
+
images_per_sample: torch.LongTensor | None = None,
|
|
667
|
+
attention_mask: torch.LongTensor | None = None,
|
|
668
|
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
|
669
|
+
"""
|
|
670
|
+
Calculate the 3D rope index for image generation task with full batch support.
|
|
671
|
+
|
|
672
|
+
Args:
|
|
673
|
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
|
674
|
+
Indices of input sequence tokens in the vocabulary.
|
|
675
|
+
image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
|
|
676
|
+
The temporal, height and width of feature shape of each image.
|
|
677
|
+
Images are packed across all samples in the batch.
|
|
678
|
+
images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
679
|
+
Number of images (including target grids) for each sample in the batch.
|
|
680
|
+
Used to split image_grid_thw by sample.
|
|
681
|
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
682
|
+
Mask to avoid performing attention on padding token indices.
|
|
683
|
+
|
|
684
|
+
Returns:
|
|
685
|
+
position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`):
|
|
686
|
+
Position IDs for temporal, height, and width dimensions.
|
|
687
|
+
mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
|
|
688
|
+
Position deltas for multi-modal rotary position embedding.
|
|
689
|
+
"""
|
|
690
|
+
batch_size, seq_len = input_ids.shape
|
|
691
|
+
device = input_ids.device
|
|
692
|
+
dtype = input_ids.dtype
|
|
693
|
+
|
|
694
|
+
image_start_token_id = self.config.image_start_token_id
|
|
695
|
+
image_end_token_id = self.config.image_end_token_id
|
|
696
|
+
|
|
697
|
+
position_ids = torch.ones(3, batch_size, seq_len, dtype=dtype, device=device)
|
|
698
|
+
text_positions = torch.arange(seq_len, device=device)[None, :].repeat(3, 1)
|
|
699
|
+
|
|
700
|
+
# Split image_grid_thw by sample if images_per_sample is provided
|
|
701
|
+
if image_grid_thw is not None and images_per_sample is not None:
|
|
702
|
+
grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
|
|
703
|
+
elif image_grid_thw is not None:
|
|
704
|
+
# Fallback: assume all grids belong to first sample (batch_size=1)
|
|
705
|
+
grids_per_sample = [image_grid_thw] * batch_size
|
|
706
|
+
else:
|
|
707
|
+
grids_per_sample = [None] * batch_size
|
|
708
|
+
|
|
709
|
+
# Per-sample caches for decode stage
|
|
710
|
+
all_decode_position_ids = []
|
|
711
|
+
|
|
712
|
+
for batch_idx in range(batch_size):
|
|
713
|
+
curr_input_ids = input_ids[batch_idx]
|
|
714
|
+
curr_grids = grids_per_sample[batch_idx]
|
|
715
|
+
|
|
716
|
+
if attention_mask is not None and attention_mask.shape[1] == seq_len:
|
|
717
|
+
valid_mask = attention_mask[batch_idx] == 1
|
|
718
|
+
curr_input_ids_valid = curr_input_ids[valid_mask]
|
|
719
|
+
else:
|
|
720
|
+
# attention_mask may have different length during assisted decoding
|
|
721
|
+
curr_input_ids_valid = curr_input_ids
|
|
722
|
+
valid_mask = None
|
|
723
|
+
|
|
724
|
+
# Find image boundaries in this sample
|
|
725
|
+
image_end_positions = torch.where(curr_input_ids_valid == image_end_token_id)[0]
|
|
726
|
+
image_start_positions = torch.where(curr_input_ids_valid == image_start_token_id)[0] + 1
|
|
727
|
+
num_complete_images = len(image_end_positions)
|
|
728
|
+
|
|
729
|
+
current_pos = 0
|
|
730
|
+
prev_image_end = 0
|
|
731
|
+
curr_position_ids = []
|
|
732
|
+
|
|
733
|
+
# Process complete images (source images in image-to-image task)
|
|
734
|
+
for img_idx, (start, end) in enumerate(zip(image_start_positions, image_end_positions)):
|
|
735
|
+
if curr_grids is None or img_idx >= len(curr_grids):
|
|
736
|
+
break
|
|
737
|
+
grid = curr_grids[img_idx]
|
|
738
|
+
# grid format is [temporal, height, width]
|
|
739
|
+
_, height, width = grid.tolist()
|
|
740
|
+
|
|
741
|
+
# Text tokens before this image
|
|
742
|
+
llm_pos_length = start - prev_image_end
|
|
743
|
+
llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(device=device)
|
|
744
|
+
current_pos += llm_position_ids.shape[-1]
|
|
745
|
+
|
|
746
|
+
# Image tokens with 2D spatial encoding
|
|
747
|
+
# For an image with height H and width W:
|
|
748
|
+
# - position_width cycles [0, 1, ..., W-1] for each row, repeated H times
|
|
749
|
+
# - position_height stays constant per row, [0]*W, [1]*W, ..., [H-1]*W
|
|
750
|
+
image_seq_length = height * width
|
|
751
|
+
position_width = torch.arange(current_pos, current_pos + width, device=device).repeat(height)
|
|
752
|
+
position_height = torch.arange(current_pos, current_pos + height, device=device).repeat_interleave(
|
|
753
|
+
width
|
|
754
|
+
)
|
|
755
|
+
position_temporal = torch.full((image_seq_length,), current_pos, device=device, dtype=torch.long)
|
|
756
|
+
vision_position_ids = torch.stack([position_temporal, position_height, position_width], dim=0)
|
|
757
|
+
current_pos += max(height, width)
|
|
758
|
+
|
|
759
|
+
prev_image_end = end
|
|
760
|
+
curr_position_ids.append(torch.cat([llm_position_ids, vision_position_ids], dim=-1))
|
|
761
|
+
|
|
762
|
+
# Remaining text tokens (including the final image_start token for generation)
|
|
763
|
+
end_position = len(curr_input_ids_valid) - prev_image_end
|
|
764
|
+
llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=device)
|
|
765
|
+
current_pos += llm_position_ids.shape[-1]
|
|
766
|
+
curr_position_ids.append(llm_position_ids)
|
|
767
|
+
|
|
768
|
+
# Concatenate all position ids for this sample
|
|
769
|
+
curr_position_ids = torch.cat(curr_position_ids, dim=-1)
|
|
770
|
+
|
|
771
|
+
# Store in the main position_ids tensor
|
|
772
|
+
if valid_mask is not None:
|
|
773
|
+
position_ids[:, batch_idx, valid_mask] = curr_position_ids
|
|
774
|
+
else:
|
|
775
|
+
position_ids[:, batch_idx, :] = curr_position_ids
|
|
776
|
+
|
|
777
|
+
# Build decode position ids for this sample
|
|
778
|
+
if curr_grids is not None and len(curr_grids) > 0:
|
|
779
|
+
num_decode_grids = len(curr_grids) - num_complete_images
|
|
780
|
+
num_decode_grids = max(num_decode_grids, 0)
|
|
781
|
+
decode_pos = current_pos
|
|
782
|
+
|
|
783
|
+
decode_temporal_list = []
|
|
784
|
+
decode_height_list = []
|
|
785
|
+
decode_width_list = []
|
|
786
|
+
|
|
787
|
+
for i in range(1, num_decode_grids + 1):
|
|
788
|
+
grid_idx = -i
|
|
789
|
+
h = curr_grids[grid_idx, 1].item()
|
|
790
|
+
w = curr_grids[grid_idx, 2].item()
|
|
791
|
+
total_tokens = h * w
|
|
792
|
+
|
|
793
|
+
h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
|
|
794
|
+
w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
|
|
795
|
+
|
|
796
|
+
decode_temporal_list.append(
|
|
797
|
+
torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long)
|
|
798
|
+
)
|
|
799
|
+
decode_height_list.append(decode_pos + h_indices)
|
|
800
|
+
decode_width_list.append(decode_pos + w_indices)
|
|
801
|
+
decode_pos = decode_pos + max(h, w)
|
|
802
|
+
|
|
803
|
+
# End marker
|
|
804
|
+
decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
|
|
805
|
+
decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
|
|
806
|
+
decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
|
|
807
|
+
|
|
808
|
+
sample_decode_pos_ids = torch.stack(
|
|
809
|
+
[
|
|
810
|
+
torch.cat(decode_temporal_list, dim=0),
|
|
811
|
+
torch.cat(decode_height_list, dim=0),
|
|
812
|
+
torch.cat(decode_width_list, dim=0),
|
|
813
|
+
],
|
|
814
|
+
dim=0,
|
|
815
|
+
)
|
|
816
|
+
all_decode_position_ids.append(sample_decode_pos_ids)
|
|
817
|
+
|
|
818
|
+
# Store prefill length (same for all samples since input_ids is padded to same length)
|
|
819
|
+
self._prefill_len = seq_len
|
|
820
|
+
|
|
821
|
+
# Pad decode position ids to same length and stack
|
|
822
|
+
if all_decode_position_ids:
|
|
823
|
+
max_decode_len = max(x.shape[1] for x in all_decode_position_ids)
|
|
824
|
+
padded_decode_pos_ids = [
|
|
825
|
+
F.pad(pos_ids, (0, max_decode_len - pos_ids.shape[1]), mode="replicate")
|
|
826
|
+
for pos_ids in all_decode_position_ids
|
|
827
|
+
]
|
|
828
|
+
self._cached_decode_position_ids = torch.stack(padded_decode_pos_ids, dim=0) # [batch, 3, max_decode_len]
|
|
829
|
+
else:
|
|
830
|
+
self._cached_decode_position_ids = None
|
|
831
|
+
|
|
832
|
+
mrope_position_deltas = torch.zeros([batch_size, 1], dtype=dtype, device=device)
|
|
833
|
+
|
|
834
|
+
return position_ids, mrope_position_deltas
|
|
835
|
+
|
|
836
|
+
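To make the 2D layout in `get_rope_index` concrete: for one image block of height `H` and width `W` starting at `current_pos`, the width component cycles within each row while the height component stays constant per row, exactly as the in-code comment describes. A worked example with `H=2`, `W=3`, `current_pos=0`:

```python
# Worked example of the per-image 2D position layout built in get_rope_index above.
import torch

h, w, start = 2, 3, 0
position_width = torch.arange(start, start + w).repeat(h)              # cycles per row
position_height = torch.arange(start, start + h).repeat_interleave(w)  # constant per row
assert position_width.tolist() == [0, 1, 2, 0, 1, 2]
assert position_height.tolist() == [0, 0, 0, 1, 1, 1]
```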
def get_image_tokens(
|
|
837
|
+
self,
|
|
838
|
+
hidden_states: torch.FloatTensor,
|
|
839
|
+
image_grid_thw: torch.LongTensor,
|
|
840
|
+
) -> torch.LongTensor:
|
|
841
|
+
"""
|
|
842
|
+
Tokenizes image features into discrete tokens with VQVAE module.
|
|
843
|
+
|
|
844
|
+
Args:
|
|
845
|
+
hidden_states (`torch.FloatTensor` of shape `(total_patches, hidden_size)`):
|
|
846
|
+
The packed image features from vision encoder.
|
|
847
|
+
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
|
|
848
|
+
The temporal, height and width of feature shape of each image.
|
|
849
|
+
|
|
850
|
+
Returns:
|
|
851
|
+
image_tokens (`torch.LongTensor` of shape `(total_patches,)`):
|
|
852
|
+
Discrete token indices from the VQVAE codebook.
|
|
853
|
+
"""
|
|
854
|
+
hidden_size = hidden_states.shape[-1]
|
|
855
|
+
split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
|
|
856
|
+
hidden_states_list = torch.split(hidden_states, split_sizes, dim=0)
|
|
857
|
+
|
|
858
|
+
all_image_toks = []
|
|
859
|
+
for i, hs in enumerate(hidden_states_list):
|
|
860
|
+
grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
|
|
861
|
+
hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
|
|
862
|
+
hs = hs.permute(0, 3, 1, 2).contiguous()
|
|
863
|
+
vqmodel_outputs: GlmImageVQVAEModelOutput = self.vqmodel.encode(hs)
|
|
864
|
+
all_image_toks.append(vqmodel_outputs.image_tokens)
|
|
865
|
+
return torch.cat(all_image_toks, dim=0)
|
|
866
|
+
|
|
867
|
+
def get_video_features(self):
|
|
868
|
+
raise AttributeError("Not needed for GlmImage")
|
|
869
|
+
|
|
870
|
+
@can_return_tuple
|
|
871
|
+
@auto_docstring
|
|
872
|
+
def get_image_features(
|
|
873
|
+
self,
|
|
874
|
+
pixel_values: torch.FloatTensor,
|
|
875
|
+
image_grid_thw: torch.LongTensor | None = None,
|
|
876
|
+
**kwargs: Unpack[TransformersKwargs],
|
|
877
|
+
) -> tuple | BaseModelOutputWithPooling:
|
|
878
|
+
r"""
|
|
879
|
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
|
880
|
+
The tensors corresponding to the input images.
|
|
881
|
+
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
|
|
882
|
+
The temporal, height and width of feature shape of each image in LLM.
|
|
883
|
+
"""
|
|
884
|
+
pixel_values = pixel_values.type(self.visual.dtype)
|
|
885
|
+
vision_outputs = self.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs)
|
|
886
|
+
split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
|
|
887
|
+
image_embeds = torch.split(vision_outputs.last_hidden_state, split_sizes)
|
|
888
|
+
vision_outputs.pooler_output = image_embeds
|
|
889
|
+
|
|
890
|
+
return vision_outputs
|
|
891
|
+
|
|
892
|
+
def get_placeholder_mask(
|
|
893
|
+
self,
|
|
894
|
+
input_ids: torch.LongTensor,
|
|
895
|
+
image_ids: torch.LongTensor,
|
|
896
|
+
):
|
|
897
|
+
"""
|
|
898
|
+
Replace image placeholder tokens in input_ids with actual image token ids from VQVAE.
|
|
899
|
+
|
|
900
|
+
Args:
|
|
901
|
+
input_ids (`torch.LongTensor` of shape `(batch_size, seq_len)`):
|
|
902
|
+
Input token ids with image placeholders.
|
|
903
|
+
image_ids (`torch.LongTensor` of shape `(num_images, num_tokens_per_image)` or flattened):
|
|
904
|
+
Discrete token indices from the VQVAE codebook.
|
|
905
|
+
|
|
906
|
+
Returns:
|
|
907
|
+
special_image_mask (`torch.LongTensor` of shape `(batch_size, seq_len)`):
|
|
908
|
+
Mask indicating positions in input ids that will be replaced by actual image tokens.
|
|
909
|
+
"""
|
|
910
|
+
|
|
911
|
+
special_image_mask = input_ids == self.config.image_token_id
|
|
912
|
+
n_placeholder_tokens = special_image_mask.sum().item()
|
|
913
|
+
n_image_tokens = image_ids.shape[0]
|
|
914
|
+
|
|
915
|
+
if n_placeholder_tokens != n_image_tokens:
|
|
916
|
+
raise ValueError(
|
|
917
|
+
f"Number of image placeholder tokens ({n_placeholder_tokens}) does not match "
|
|
918
|
+
f"number of image tokens from VQVAE ({n_image_tokens})"
|
|
919
|
+
)
|
|
920
|
+
|
|
921
|
+
return special_image_mask
|
|
922
|
+
|
|
923
|
+
def forward(
|
|
924
|
+
self,
|
|
925
|
+
input_ids: torch.LongTensor | None = None,
|
|
926
|
+
attention_mask: torch.Tensor | None = None,
|
|
927
|
+
position_ids: torch.LongTensor | None = None,
|
|
928
|
+
past_key_values: Cache | None = None,
|
|
929
|
+
inputs_embeds: torch.FloatTensor | None = None,
|
|
930
|
+
pixel_values: torch.Tensor | None = None,
|
|
931
|
+
image_grid_thw: torch.LongTensor | None = None,
|
|
932
|
+
images_per_sample: torch.LongTensor | None = None,
|
|
933
|
+
rope_deltas: torch.LongTensor | None = None,
|
|
934
|
+
cache_position: torch.LongTensor | None = None,
|
|
935
|
+
**kwargs: Unpack[TransformersKwargs],
|
|
936
|
+
) -> tuple | GlmImageModelOutputWithPast:
|
|
937
|
+
r"""
|
|
938
|
+
image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
|
|
939
|
+
The temporal, height and width of feature shape of each image in LLM.
|
|
940
|
+
Images are packed across all samples in the batch.
|
|
941
|
+
images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
942
|
+
Number of images (including target grids) for each sample in the batch.
|
|
943
|
+
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
|
|
944
|
+
The rope index difference between sequence length and multimodal rope.
|
|
945
|
+
"""
|
|
946
|
+
if (input_ids is None) ^ (inputs_embeds is not None):
|
|
947
|
+
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
|
948
|
+
|
|
949
|
+
batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
|
|
950
|
+
|
|
951
|
+
if pixel_values is not None:
|
|
952
|
+
# Process source images (image-to-image mode)
|
|
953
|
+
# Source images are identified by counting image_end_token_id in input_ids
|
|
954
|
+
# Note: We must exclude padding tokens since pad_token_id == image_end_token_id
|
|
955
|
+
if images_per_sample is not None:
|
|
956
|
+
grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
|
|
957
|
+
# Create mask for non-padding tokens (attention_mask=1 means non-padding)
|
|
958
|
+
# Handle 4D attention mask (from static cache) by extracting diagonal
|
|
959
|
+
if attention_mask is not None and attention_mask.ndim == 4:
|
|
960
|
+
non_pad_mask = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
|
|
961
|
+
if non_pad_mask.dtype.is_floating_point:
|
|
962
|
+
non_pad_mask = non_pad_mask / torch.finfo(non_pad_mask.dtype).min
|
|
963
|
+
non_pad_mask = (1.0 - non_pad_mask).int()
|
|
964
|
+
# Only keep columns matching input_ids length
|
|
965
|
+
non_pad_mask = non_pad_mask[:, -input_ids.shape[1] :]
|
|
966
|
+
else:
|
|
967
|
+
non_pad_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids)
|
|
968
|
+
|
|
969
|
+
source_grids_list = []
|
|
970
|
+
for sample_idx in range(batch_size):
|
|
971
|
+
is_image_end = input_ids[sample_idx] == self.config.image_end_token_id
|
|
972
|
+
is_non_pad = non_pad_mask[sample_idx] == 1
|
|
973
|
+
num_source = (is_image_end & is_non_pad).sum().item()
|
|
974
|
+
if num_source > 0:
|
|
975
|
+
source_grids_list.append(grids_per_sample[sample_idx][:num_source])
|
|
976
|
+
if len(source_grids_list) == 0:
|
|
977
|
+
raise ValueError(
|
|
978
|
+
"pixel_values provided but no source images found in input_ids. "
|
|
979
|
+
"Ensure input_ids contains image_end_token_id for each source image."
|
|
980
|
+
)
|
|
981
|
+
source_grids = torch.cat(source_grids_list, dim=0)
|
|
982
|
+
else:
|
|
983
|
+
# Fallback for batch_size=1: all but last grid are source images
|
|
984
|
+
source_grids = image_grid_thw[:-1]
|
|
985
|
+
|
|
986
|
+
image_features = self.get_image_features(pixel_values, source_grids, return_dict=True)
|
|
987
|
+
image_embeds = torch.cat(image_features.pooler_output, dim=0)
|
|
988
|
+
image_ids = self.get_image_tokens(image_embeds, source_grids)
|
|
989
|
+
image_ids = image_ids.view(-1).to(input_ids.device)
|
|
990
|
+
special_image_mask = self.get_placeholder_mask(input_ids, image_ids)
|
|
991
|
+
input_ids = input_ids.masked_scatter(special_image_mask, image_ids)
|
|
992
|
+
|
|
993
|
+
if inputs_embeds is None:
|
|
994
|
+
inputs_embeds = self.get_input_embeddings()(input_ids)
|
|
995
|
+
|
|
996
|
+
if position_ids is None:
|
|
997
|
+
attention_mask_2d = attention_mask
|
|
998
|
+
if attention_mask is not None and attention_mask.ndim == 4:
|
|
999
|
+
attention_mask_2d = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
|
|
1000
|
+
# Only apply conversion for floating point tensors (inverted masks)
|
|
1001
|
+
if attention_mask_2d.dtype.is_floating_point:
|
|
1002
|
+
attention_mask_2d = attention_mask_2d / torch.finfo(attention_mask_2d.dtype).min
|
|
1003
|
+
attention_mask_2d = (1.0 - attention_mask_2d).int()
|
|
1004
|
+
|
|
1005
|
+
# Calculate RoPE index once per generation in the pre-fill stage only.
|
|
1006
|
+
is_prefill_stage = (input_ids is not None and input_ids.shape[1] != 1) or (
|
|
1007
|
+
inputs_embeds is not None and inputs_embeds.shape[1] != 1
|
|
1008
|
+
)
|
|
1009
|
+
if is_prefill_stage or self.rope_deltas is None:
|
|
1010
|
+
position_ids, rope_deltas = self.get_rope_index(
|
|
1011
|
+
input_ids,
|
|
1012
|
+
image_grid_thw,
|
|
1013
|
+
images_per_sample=images_per_sample,
|
|
1014
|
+
attention_mask=attention_mask_2d,
|
|
1015
|
+
)
|
|
1016
|
+
self.rope_deltas = rope_deltas
|
|
1017
|
+
# then use the prev pre-calculated rope-deltas to get the correct position ids
|
|
1018
|
+
else:
|
|
1019
|
+
batch_size, seq_length, _ = inputs_embeds.shape
|
|
1020
|
+
# Per-sample decode position lookup
|
|
1021
|
+
# _cached_decode_position_ids shape: [batch_size, 3, max_decode_len]
|
|
1022
|
+
if self._cached_decode_position_ids is not None:
|
|
1023
|
+
step = cache_position[0].item() - self._prefill_len
|
|
1024
|
+
# Get position ids for all samples at once, then transpose to [3, batch_size, seq_length]
|
|
1025
|
+
position_ids = self._cached_decode_position_ids[:, :, step : step + seq_length].permute(1, 0, 2)
|
|
1026
|
+
else:
|
|
1027
|
+
# Fallback for text-to-image or cases without cached decode positions
|
|
1028
|
+
# Use simple incremental positions
|
|
1029
|
+
start_pos = cache_position[0].item()
|
|
1030
|
+
position_ids = torch.arange(
|
|
1031
|
+
start_pos, start_pos + seq_length, device=inputs_embeds.device, dtype=torch.long
|
|
1032
|
+
)
|
|
1033
|
+
position_ids = position_ids.unsqueeze(0).repeat(3, batch_size, 1)
|
|
1034
|
+
|
|
1035
|
+
outputs = self.language_model(
|
|
1036
|
+
input_ids=None,
|
|
1037
|
+
position_ids=position_ids,
|
|
1038
|
+
attention_mask=attention_mask,
|
|
1039
|
+
past_key_values=past_key_values,
|
|
1040
|
+
inputs_embeds=inputs_embeds,
|
|
1041
|
+
cache_position=cache_position,
|
|
1042
|
+
**kwargs,
|
|
1043
|
+
)
|
|
1044
|
+
|
|
1045
|
+
return GlmImageModelOutputWithPast(
|
|
1046
|
+
last_hidden_state=outputs.last_hidden_state,
|
|
1047
|
+
past_key_values=outputs.past_key_values,
|
|
1048
|
+
hidden_states=outputs.hidden_states,
|
|
1049
|
+
attentions=outputs.attentions,
|
|
1050
|
+
rope_deltas=self.rope_deltas,
|
|
1051
|
+
)
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
class GlmImageCausalLMOutputWithPast(Glm4vCausalLMOutputWithPast):
|
|
1055
|
+
pass
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin):
|
|
1059
|
+
_checkpoint_conversion_mapping = {}
|
|
1060
|
+
_tied_weights_keys = {}
|
|
1061
|
+
# Reference: fix gemma3 grad acc #37208
|
|
1062
|
+
accepts_loss_kwargs = False
|
|
1063
|
+
base_model_prefix = "model"
|
|
1064
|
+
config: GlmImageConfig
|
|
1065
|
+
|
|
1066
|
+
def __init__(self, config):
|
|
1067
|
+
super().__init__(config)
|
|
1068
|
+
self.model = GlmImageModel(config)
|
|
1069
|
+
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vision_vocab_size, bias=False)
|
|
1070
|
+
|
|
1071
|
+
# Initialize weights and apply final processing
|
|
1072
|
+
self.post_init()
|
|
1073
|
+
|
|
1074
|
+
@auto_docstring
|
|
1075
|
+
def get_image_features(
|
|
1076
|
+
self,
|
|
1077
|
+
pixel_values: torch.FloatTensor,
|
|
1078
|
+
image_grid_thw: torch.LongTensor | None = None,
|
|
1079
|
+
**kwargs: Unpack[TransformersKwargs],
|
|
1080
|
+
) -> tuple | BaseModelOutputWithPooling:
|
|
1081
|
+
r"""
|
|
1082
|
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
|
1083
|
+
The tensors corresponding to the input images.
|
|
1084
|
+
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
|
|
1085
|
+
The temporal, height and width of feature shape of each image in LLM.
|
|
1086
|
+
"""
|
|
1087
|
+
return self.model.get_image_features(pixel_values, image_grid_thw, **kwargs)
|
|
1088
|
+
|
|
1089
|
+
def get_image_tokens(self, hidden_states: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
|
|
1090
|
+
return self.model.get_image_tokens(hidden_states, image_grid_thw)
|
|
1091
|
+
|
|
1092
|
+
def forward(
|
|
1093
|
+
self,
|
|
1094
|
+
input_ids: torch.LongTensor | None = None,
|
|
1095
|
+
attention_mask: torch.Tensor | None = None,
|
|
1096
|
+
position_ids: torch.LongTensor | None = None,
|
|
1097
|
+
past_key_values: Cache | None = None,
|
|
1098
|
+
inputs_embeds: torch.FloatTensor | None = None,
|
|
1099
|
+
labels: torch.LongTensor | None = None,
|
|
1100
|
+
pixel_values: torch.Tensor | None = None,
|
|
1101
|
+
image_grid_thw: torch.LongTensor | None = None,
|
|
1102
|
+
images_per_sample: torch.LongTensor | None = None,
|
|
1103
|
+
cache_position: torch.LongTensor | None = None,
|
|
1104
|
+
logits_to_keep: int | torch.Tensor = 0,
|
|
1105
|
+
**kwargs: Unpack[TransformersKwargs],
|
|
1106
|
+
) -> tuple | GlmImageCausalLMOutputWithPast:
|
|
1107
|
+
r"""
|
|
1108
|
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
1109
|
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
|
1110
|
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
|
1111
|
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
|
1112
|
+
image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
|
|
1113
|
+
The temporal, height and width of feature shape of each image in LLM.
|
|
1114
|
+
Images are packed across all samples in the batch.
|
|
1115
|
+
images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
1116
|
+
Number of images (including target grids) for each sample in the batch.
|
|
1117
|
+
|
|
1118
|
+
Example:
|
|
1119
|
+
|
|
1120
|
+
```python
|
|
1121
|
+
>>> from PIL import Image
|
|
1122
|
+
>>> import httpx
|
|
1123
|
+
>>> from io import BytesIO
|
|
1124
|
+
>>> from transformers import AutoProcessor, GlmImageForConditionalGeneration
|
|
1125
|
+
|
|
1126
|
+
>>> model = GlmImageForConditionalGeneration.from_pretrained("zai-org/GLM-Image")
|
|
1127
|
+
>>> processor = AutoProcessor.from_pretrained("zai-org/GLM-Image")
|
|
1128
|
+
|
|
1129
|
+
>>> messages = [
|
|
1130
|
+
{
|
|
1131
|
+
"role": "user",
|
|
1132
|
+
"content": [
|
|
1133
|
+
{"type": "image"},
|
|
1134
|
+
{"type": "text", "text": "Add a truck of this photo.<sop>28 40<eop>"},
|
|
1135
|
+
],
|
|
1136
|
+
},
|
|
1137
|
+
]
|
|
1138
|
+
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
|
|
1139
|
+
>>> with httpx.stream("GET", url) as response:
|
|
1140
|
+
... image = Image.open(BytesIO(response.read()))
|
|
1141
|
+
|
|
1142
|
+
>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
1143
|
+
        >>> inputs = processor(text=[text], images=[image])
|
|
1144
|
+
|
|
1145
|
+
>>> # Generate
|
|
1146
|
+
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
|
1147
|
+
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
|
1148
|
+
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
|
|
1149
|
+
```"""
|
|
1150
|
+
outputs = self.model(
|
|
1151
|
+
input_ids=input_ids,
|
|
1152
|
+
pixel_values=pixel_values,
|
|
1153
|
+
image_grid_thw=image_grid_thw,
|
|
1154
|
+
images_per_sample=images_per_sample,
|
|
1155
|
+
position_ids=position_ids,
|
|
1156
|
+
attention_mask=attention_mask,
|
|
1157
|
+
past_key_values=past_key_values,
|
|
1158
|
+
inputs_embeds=inputs_embeds,
|
|
1159
|
+
cache_position=cache_position,
|
|
1160
|
+
**kwargs,
|
|
1161
|
+
)
|
|
1162
|
+
|
|
1163
|
+
hidden_states = outputs[0]
|
|
1164
|
+
|
|
1165
|
+
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
|
1166
|
+
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
|
1167
|
+
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
|
1168
|
+
|
|
1169
|
+
loss = None
|
|
1170
|
+
if labels is not None:
|
|
1171
|
+
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
|
|
1172
|
+
|
|
1173
|
+
return GlmImageCausalLMOutputWithPast(
|
|
1174
|
+
loss=loss,
|
|
1175
|
+
logits=logits,
|
|
1176
|
+
past_key_values=outputs.past_key_values,
|
|
1177
|
+
hidden_states=outputs.hidden_states,
|
|
1178
|
+
attentions=outputs.attentions,
|
|
1179
|
+
rope_deltas=outputs.rope_deltas,
|
|
1180
|
+
)
|
|
1181
|
+
|
|
1182
|
+
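`logits_to_keep` above follows the usual convention: an integer `N` keeps only the last `N` positions of the hidden states before the LM head (with `0` keeping all of them), while a tensor selects explicit indices. A quick check of the slicing:

```python
# Toy check of the logits_to_keep slicing used in forward() above.
import torch

hidden = torch.randn(1, 7, 4)  # [batch, seq, hidden]
logits_to_keep = 1
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
assert hidden[:, slice_indices, :].shape == (1, 1, 4)
```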
def prepare_inputs_for_generation(
|
|
1183
|
+
self,
|
|
1184
|
+
input_ids,
|
|
1185
|
+
past_key_values=None,
|
|
1186
|
+
attention_mask=None,
|
|
1187
|
+
inputs_embeds=None,
|
|
1188
|
+
cache_position=None,
|
|
1189
|
+
position_ids=None,
|
|
1190
|
+
use_cache=True,
|
|
1191
|
+
pixel_values=None,
|
|
1192
|
+
image_grid_thw=None,
|
|
1193
|
+
images_per_sample=None,
|
|
1194
|
+
is_first_iteration=False,
|
|
1195
|
+
**kwargs,
|
|
1196
|
+
):
|
|
1197
|
+
model_inputs = super().prepare_inputs_for_generation(
|
|
1198
|
+
input_ids,
|
|
1199
|
+
past_key_values=past_key_values,
|
|
1200
|
+
attention_mask=attention_mask,
|
|
1201
|
+
inputs_embeds=inputs_embeds,
|
|
1202
|
+
cache_position=cache_position,
|
|
1203
|
+
position_ids=position_ids,
|
|
1204
|
+
pixel_values=pixel_values,
|
|
1205
|
+
image_grid_thw=image_grid_thw,
|
|
1206
|
+
is_first_iteration=is_first_iteration,
|
|
1207
|
+
use_cache=use_cache,
|
|
1208
|
+
**kwargs,
|
|
1209
|
+
)
|
|
1210
|
+
|
|
1211
|
+
model_inputs["position_ids"] = None
|
|
1212
|
+
model_inputs["images_per_sample"] = images_per_sample
|
|
1213
|
+
|
|
1214
|
+
if not is_first_iteration and use_cache:
|
|
1215
|
+
model_inputs["pixel_values"] = None
|
|
1216
|
+
|
|
1217
|
+
return model_inputs
|
|
1218
|
+
|
|
1219
|
+
def _get_image_nums(
|
|
1220
|
+
self,
|
|
1221
|
+
input_ids: torch.LongTensor | None,
|
|
1222
|
+
) -> torch.Tensor:
|
|
1223
|
+
"""
|
|
1224
|
+
Get the number of images for each sample.
|
|
1225
|
+
For GLM-Image, only input_ids allow us to get the number of images.
|
|
1226
|
+
|
|
1227
|
+
Returns:
|
|
1228
|
+
image_counts (`torch.LongTensor` of shape `(batch_size,)`)
|
|
1229
|
+
"""
|
|
1230
|
+
is_image = input_ids == self.config.image_start_token_id
|
|
1231
|
+
|
|
1232
|
+
return is_image.sum(dim=1)
|
|
1233
|
+
|
|
1234
|
+
def _expand_inputs_for_generation(
|
|
1235
|
+
self,
|
|
1236
|
+
expand_size: int = 1,
|
|
1237
|
+
is_encoder_decoder: bool = False,
|
|
1238
|
+
input_ids: torch.LongTensor | None = None,
|
|
1239
|
+
**model_kwargs,
|
|
1240
|
+
) -> tuple[torch.LongTensor, dict[str, Any]]:
|
|
1241
|
+
# Overwritten -- Support for expanding tensors without a batch size dimension
|
|
1242
|
+
# e.g., pixel_values, image_grid_thw
|
|
1243
|
+
# pixel_values.shape[0] is sum(seqlen_images for samples)
|
|
1244
|
+
# image_grid_thw.shape[0] is sum(num_images for samples)
|
|
1245
|
+
|
|
1246
|
+
if expand_size == 1:
|
|
1247
|
+
return input_ids, model_kwargs
|
|
1248
|
+
|
|
1249
|
+
visual_keys = ["pixel_values", "image_grid_thw", "images_per_sample"]
|
|
1250
|
+
|
|
1251
|
+
def _expand_dict_for_generation_visual(dict_to_expand):
|
|
1252
|
+
image_grid_thw = model_kwargs.get("image_grid_thw", None)
|
|
1253
|
+
if image_grid_thw is None:
|
|
1254
|
+
return dict_to_expand
|
|
1255
|
+
|
|
1256
|
+
images_per_sample = model_kwargs.get("images_per_sample", None)
|
|
1257
|
+
|
|
1258
|
+
# Use images_per_sample if available
|
|
1259
|
+
if images_per_sample is not None:
|
|
1260
|
+
image_nums = images_per_sample.tolist()
|
|
1261
|
+
elif input_ids is not None:
|
|
1262
|
+
# Try to infer from image_grid_thw / batch_size
|
|
1263
|
+
batch_size = input_ids.shape[0]
|
|
1264
|
+
total_grids = image_grid_thw.shape[0]
|
|
1265
|
+
if total_grids % batch_size == 0:
|
|
1266
|
+
grids_per_sample = total_grids // batch_size
|
|
1267
|
+
image_nums = [grids_per_sample] * batch_size
|
|
1268
|
+
else:
|
|
1269
|
+
# Cannot evenly distribute grids - fall back to simple repeat_interleave
|
|
1270
|
+
# This handles test cases where image_grid_thw has (batch_size + 1) rows
|
|
1271
|
+
dict_to_expand["image_grid_thw"] = image_grid_thw.repeat_interleave(expand_size, dim=0)
|
|
1272
|
+
if dict_to_expand.get("pixel_values") is not None:
|
|
1273
|
+
dict_to_expand["pixel_values"] = dict_to_expand["pixel_values"].repeat_interleave(
|
|
1274
|
+
expand_size, dim=0
|
|
1275
|
+
)
|
|
1276
|
+
return dict_to_expand
|
|
1277
|
+
else:
|
|
1278
|
+
image_nums = self._get_image_nums(input_ids).tolist()
|
|
1279
|
+
|
|
1280
|
+
# Get source image counts per sample from image_end_token_id count
|
|
1281
|
+
source_image_nums = [
|
|
1282
|
+
(input_ids[batch_idx] == self.config.image_end_token_id).sum().item()
|
|
1283
|
+
for batch_idx in range(len(image_nums))
|
|
1284
|
+
]
|
|
1285
|
+
|
|
1286
|
+
def _repeat_interleave_samples(x, lengths, repeat_times):
|
|
1287
|
+
samples = torch.split(x, lengths)
|
|
1288
|
+
repeat_args = [repeat_times] + [1] * (x.dim() - 1)
|
|
1289
|
+
result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
|
|
1290
|
+
return result
|
|
1291
|
+
|
|
1292
|
+
for key in dict_to_expand:
|
|
1293
|
+
if key == "pixel_values":
|
|
1294
|
+
# Split images into samples based on source image counts
|
|
1295
|
+
if sum(source_image_nums) > 0:
|
|
1296
|
+
# Split grids by sample to compute pixel counts
|
|
1297
|
+
grids_per_sample = torch.split(image_grid_thw, image_nums)
|
|
1298
|
+
lengths = []
|
|
1299
|
+
for batch_idx, sample_grids in enumerate(grids_per_sample):
|
|
1300
|
+
num_source = source_image_nums[batch_idx]
|
|
1301
|
+
if num_source > 0:
|
|
1302
|
+
source_grids = sample_grids[:num_source]
|
|
1303
|
+
lengths.append(torch.prod(source_grids, dim=1).sum().item())
|
|
1304
|
+
else:
|
|
1305
|
+
lengths.append(0)
|
|
1306
|
+
|
|
1307
|
+
dict_to_expand[key] = _repeat_interleave_samples(
|
|
1308
|
+
dict_to_expand[key], lengths=lengths, repeat_times=expand_size
|
|
1309
|
+
)
|
|
1310
|
+
elif key == "image_grid_thw":
|
|
1311
|
+
# Expand all grids (source + target) per sample
|
|
1312
|
+
dict_to_expand[key] = _repeat_interleave_samples(
|
|
1313
|
+
dict_to_expand[key], lengths=image_nums, repeat_times=expand_size
|
|
1314
|
+
)
|
|
1315
|
+
elif key == "images_per_sample":
|
|
1316
|
+
# Simply repeat the counts
|
|
1317
|
+
if dict_to_expand.get(key) is not None:
|
|
1318
|
+
dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
|
|
1319
|
+
return dict_to_expand
|
|
1320
|
+
|
|
1321
|
+
def _expand_dict_for_generation(dict_to_expand):
|
|
1322
|
+
for key in dict_to_expand:
|
|
1323
|
+
if (
|
|
1324
|
+
key != "cache_position"
|
|
1325
|
+
and dict_to_expand[key] is not None
|
|
1326
|
+
and isinstance(dict_to_expand[key], torch.Tensor)
|
|
1327
|
+
and key not in visual_keys
|
|
1328
|
+
):
|
|
1329
|
+
dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
|
|
1330
|
+
return dict_to_expand
|
|
1331
|
+
|
|
1332
|
+
model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
|
|
1333
|
+
|
|
1334
|
+
if input_ids is not None:
|
|
1335
|
+
input_ids = input_ids.repeat_interleave(expand_size, dim=0)
|
|
1336
|
+
|
|
1337
|
+
model_kwargs = _expand_dict_for_generation(model_kwargs)
|
|
1338
|
+
|
|
1339
|
+
if is_encoder_decoder:
|
|
1340
|
+
if model_kwargs.get("encoder_outputs") is None:
|
|
1341
|
+
raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
|
|
1342
|
+
model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
|
|
1343
|
+
|
|
1344
|
+
return input_ids, model_kwargs
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
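`_expand_inputs_for_generation` above cannot simply `repeat_interleave` the packed visual tensors, because all rows belonging to one sample must stay contiguous after expansion; each sample's block is therefore split out and repeated as a unit. A toy version of `_repeat_interleave_samples`:

```python
# Toy per-sample expansion, mirroring _repeat_interleave_samples above.
import torch

x = torch.arange(5).unsqueeze(1)  # 5 packed rows: sample 0 owns 2 rows, sample 1 owns 3
lengths, expand_size = [2, 3], 2
samples = torch.split(x, lengths)
expanded = torch.cat([sample.repeat(expand_size, 1) for sample in samples], dim=0)
assert expanded[:, 0].tolist() == [0, 1, 0, 1, 2, 3, 4, 2, 3, 4]
```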
def smart_resize(
    height: int,
    width: int,
    factor: int = 16,
    min_pixels: int = 512 * 512,
    max_pixels: int = 2048 * 2048,
) -> tuple[int, int]:
    if height < factor or width < factor:
        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    elif max(height, width) / min(height, width) > 4:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 4, got {max(height, width) / min(height, width)}"
        )

    shortest_edge = int(round(math.sqrt(min_pixels)))
    longest_edge = int(round(math.sqrt(max_pixels)))
    min_side = min(height, width)
    max_side = max(height, width)

    scale = 1.0

    if min_side < shortest_edge:
        scale = shortest_edge / min_side

    if max_side * scale > longest_edge:
        scale = longest_edge / max_side

    height = height // 2
    width = width // 2

    h_bar = max(factor, int(round(height * scale / factor)) * factor)
    w_bar = max(factor, int(round(width * scale / factor)) * factor)

    if max(h_bar, w_bar) > longest_edge:
        beta = max(h_bar, w_bar) / longest_edge
        h_bar = max(factor, int(math.floor((h_bar / beta) / factor)) * factor)
        w_bar = max(factor, int(math.floor((w_bar / beta) / factor)) * factor)

    return h_bar, w_bar


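For orientation, with the defaults above a 1080x1920 input passes the aspect-ratio check, keeps `scale = 1.0`, is halved to 540x960, and is then snapped to multiples of `factor`. Reproducing just that final snapping step (illustrative arithmetic only, not part of the module):

```python
# Final snapping step of smart_resize for a 1080x1920 input with the defaults above
# (scale stays 1.0, so only the halving and rounding matter here).
height, width, factor = 1080 // 2, 1920 // 2, 16
h_bar = max(factor, int(round(height / factor)) * factor)
w_bar = max(factor, int(round(width / factor)) * factor)
print(h_bar, w_bar)  # 544 960
```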
class GlmImageImageProcessor(Qwen2VLImageProcessor):
|
|
1389
|
+
model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]
|
|
1390
|
+
|
|
1391
|
+
|
|
1392
|
+
class GlmImageImageProcessorFast(Qwen2VLImageProcessorFast):
|
|
1393
|
+
model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]
|
|
1394
|
+
|
|
1395
|
+
|
|
1396
|
+
class GlmImageImagesKwargs(ImagesKwargs, total=False):
|
|
1397
|
+
"""
|
|
1398
|
+
target_h (`int`):
|
|
1399
|
+
Height of the target image to be generated.
|
|
1400
|
+
target_w (`int`):
|
|
1401
|
+
Width of the target image to be generated.
|
|
1402
|
+
"""
|
|
1403
|
+
|
|
1404
|
+
target_h: int
|
|
1405
|
+
target_w: int
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
class GlmImageProcessorKwargs(Qwen2VLProcessorKwargs):
|
|
1409
|
+
images_kwargs: GlmImageImagesKwargs
|
|
1410
|
+
|
|
1411
|
+
_defaults = {
|
|
1412
|
+
"text_kwargs": {
|
|
1413
|
+
"padding": False,
|
|
1414
|
+
"return_mm_token_type_ids": False,
|
|
1415
|
+
},
|
|
1416
|
+
"images_kwargs": {
|
|
1417
|
+
"target_h": 1152,
|
|
1418
|
+
"target_w": 768,
|
|
1419
|
+
},
|
|
1420
|
+
}
|
|
1421
|
+
|
|
1422
|
+
|
|
1423
|
+
class GlmImageProcessor(ProcessorMixin):
    r"""
    Constructs a GLM-Image processor which wraps a GLM-Image image processor and a GLM-Image tokenizer into a single
    processor. See [`~GlmImageProcessor.__call__`] and [`~GlmImageProcessor.decode`] for more information.

    Args:
        image_processor ([`GlmImageImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

model_input_names = ["input_ids", "attention_mask", "pixel_values", "image_grid_thw", "images_per_sample"]
|
|
1437
|
+
|
|
1438
|
+
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
|
1439
|
+
self.image_token = tokenizer.image_token
|
|
1440
|
+
self.grid_bos_token = tokenizer.grid_bos_token
|
|
1441
|
+
self.grid_eos_token = tokenizer.grid_eos_token
|
|
1442
|
+
self.bos_token = tokenizer.bos_token
|
|
1443
|
+
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
|
|
1444
|
+
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
|
1445
|
+
|
|
1446
|
+
def __call__(
|
|
1447
|
+
self,
|
|
1448
|
+
images: ImageInput | None = None,
|
|
1449
|
+
text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
|
|
1450
|
+
**kwargs: Unpack[GlmImageProcessorKwargs],
|
|
1451
|
+
) -> BatchFeature:
|
|
1452
|
+
"""
|
|
1453
|
+
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
|
|
1454
|
+
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
|
|
1455
|
+
the text.
|
|
1456
|
+
|
|
1457
|
+
Args:
|
|
1458
|
+
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
|
1459
|
+
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
|
1460
|
+
tensor. Both channels-first and channels-last formats are supported.
|
|
1461
|
+
text (`str`, `List[str]`, `List[List[str]]`):
|
|
1462
|
+
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
|
1463
|
+
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
|
1464
|
+
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
|
1465
|
+
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
|
1466
|
+
If set, will return tensors of a particular framework. Acceptable values are:
|
|
1467
|
+
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
|
1468
|
+
- `'np'`: Return NumPy `np.ndarray` objects.
|
|
1469
|
+
|
|
1470
|
+
Returns:
|
|
1471
|
+
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
|
1472
|
+
|
|
1473
|
+
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
|
1474
|
+
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
|
1475
|
+
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
|
|
1476
|
+
`None`).
|
|
1477
|
+
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
|
1478
|
+
- **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
|
|
1479
|
+
"""
|
|
1480
|
+
output_kwargs = self._merge_kwargs(
|
|
1481
|
+
GlmImageProcessorKwargs,
|
|
1482
|
+
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
|
1483
|
+
**kwargs,
|
|
1484
|
+
)
|
|
1485
|
+
|
|
1486
|
+
target_h = output_kwargs["images_kwargs"].pop("target_h", None)
|
|
1487
|
+
target_w = output_kwargs["images_kwargs"].pop("target_w", None)
|
|
1488
|
+
is_text_to_image = images is None
|
|
1489
|
+
|
|
1490
|
+
if images is not None:
|
|
1491
|
+
image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
|
|
1492
|
+
image_grid_thw = image_inputs["image_grid_thw"]
|
|
1493
|
+
else:
|
|
1494
|
+
image_inputs = {}
|
|
1495
|
+
image_grid_thw = None
|
|
1496
|
+
|
|
1497
|
+
# Handle text=None case (image-only processing)
|
|
1498
|
+
if text is None:
|
|
1499
|
+
if images is None:
|
|
1500
|
+
raise ValueError("You must provide at least one of `text` or `images`.")
|
|
1501
|
+
return image_inputs
|
|
1502
|
+
|
|
1503
|
+
if not isinstance(text, list):
|
|
1504
|
+
text = [text]
|
|
1505
|
+
|
|
1506
|
+
batch_size = len(text)
|
|
1507
|
+
text = text.copy() # below lines change text in-place
|
|
1508
|
+
|
|
1509
|
+
# Count images per sample by counting image tokens in each text
|
|
1510
|
+
images_per_sample = []
|
|
1511
|
+
for i in range(batch_size):
|
|
1512
|
+
images_per_sample.append(text[i].count(self.image_token))
|
|
1513
|
+
|
|
1514
|
+
# Replace image tokens with the correct number of placeholder tokens
|
|
1515
|
+
if not is_text_to_image:
|
|
1516
|
+
index = 0
|
|
1517
|
+
for i in range(batch_size):
|
|
1518
|
+
while self.image_token in text[i]:
|
|
1519
|
+
grid = image_grid_thw[index]
|
|
1520
|
+
num_image_tokens = int(grid[1] * grid[2])
|
|
1521
|
+
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
|
|
1522
|
+
index += 1
|
|
1523
|
+
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
|
1524
|
+
|
|
1525
|
+
# Build prompt with target shape and combine grids in a single loop
|
|
1526
|
+
# Format: [sample0_source_grids..., sample0_target_grids, sample1_source_grids..., sample1_target_grids, ...]
|
|
1527
|
+
# Note: In i2i mode, batches are homogeneous (same number of source images per sample)
|
|
1528
|
+
num_source_images = images_per_sample[0] if images_per_sample else 0
|
|
1529
|
+
|
|
1530
|
+
# Validate homogeneity for i2i mode
|
|
1531
|
+
if not is_text_to_image and images_per_sample and len(set(images_per_sample)) != 1:
|
|
1532
|
+
raise ValueError(
|
|
1533
|
+
f"In image-to-image mode, all samples must have the same number of source images. "
|
|
1534
|
+
f"Got different counts: {images_per_sample}"
|
|
1535
|
+
)
|
|
1536
|
+
|
|
1537
|
+
all_grids = []
|
|
1538
|
+
for i in range(batch_size):
|
|
1539
|
+
text[i], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
|
|
1540
|
+
text[i], height=target_h, width=target_w, is_text_to_image=is_text_to_image
|
|
1541
|
+
)
|
|
1542
|
+
# Add source grids for this sample (i2i mode only)
|
|
1543
|
+
if not is_text_to_image and num_source_images > 0:
|
|
1544
|
+
start_idx = i * num_source_images
|
|
1545
|
+
all_grids.append(image_grid_thw[start_idx : start_idx + num_source_images])
|
|
1546
|
+
# Add target grid for this sample
|
|
1547
|
+
all_grids.append(
|
|
1548
|
+
self._build_target_image_grid_thw(
|
|
1549
|
+
token_h=token_h,
|
|
1550
|
+
token_w=token_w,
|
|
1551
|
+
prev_token_h=prev_h,
|
|
1552
|
+
prev_token_w=prev_w,
|
|
1553
|
+
is_text_to_image=is_text_to_image,
|
|
1554
|
+
)
|
|
1555
|
+
)
|
|
1556
|
+
image_inputs["image_grid_thw"] = torch.cat(all_grids, dim=0)
|
|
1557
|
+
|
|
1558
|
+
# Store images_per_sample for later use (add target images count)
|
|
1559
|
+
# Each sample will have: source_images + target_images (typically 2 for t2i, 1 for i2i)
|
|
1560
|
+
num_target_grids = 2 if is_text_to_image else 1
|
|
1561
|
+
image_inputs["images_per_sample"] = torch.tensor(
|
|
1562
|
+
[num_source_images + num_target_grids] * batch_size, dtype=torch.long
|
|
1563
|
+
)
|
|
1564
|
+
|
|
1565
|
+
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
|
1566
|
+
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
|
|
1567
|
+
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
|
1568
|
+
|
|
1569
|
+
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
|
|
1570
|
+
|
|
1571
|
+
if return_mm_token_type_ids:
|
|
1572
|
+
array_ids = np.array(text_inputs["input_ids"])
|
|
1573
|
+
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
|
1574
|
+
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
|
1575
|
+
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
|
1576
|
+
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
|
|
1577
|
+
|
|
1578
|
+
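
The image-to-image branch above expands each image token into one placeholder per spatial patch (grid height × grid width) before tokenization. A minimal standalone sketch of that expansion, assuming an illustrative `<|image|>` token and a single 1×16×24 grid (neither value is taken from the GLM-Image tokenizer):

image_token = "<|image|>"            # illustrative; the real token comes from the tokenizer
image_grid_thw = [(1, 16, 24)]       # illustrative (temporal, height, width) grid for one image

prompt = f"Restyle this photo: {image_token}"
index = 0
while image_token in prompt:
    _, h, w = image_grid_thw[index]
    prompt = prompt.replace(image_token, "<|placeholder|>" * (h * w), 1)
    index += 1
prompt = prompt.replace("<|placeholder|>", image_token)

assert prompt.count(image_token) == 16 * 24   # 384 image tokens for this grid
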
    def _build_prompt_with_target_shape(
        self,
        prompt: str,
        height: int,
        width: int,
        is_text_to_image: bool,
    ) -> tuple[str, int, int, int, int]:
        factor = 32
        height = (height // factor) * factor
        width = (width // factor) * factor
        token_h = height // factor
        token_w = width // factor
        ratio = token_h / token_w
        prev_token_h = int(math.sqrt(ratio) * (factor // 2))
        prev_token_w = int(math.sqrt(1 / ratio) * (factor // 2))

        if is_text_to_image:
            expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.grid_bos_token}{prev_token_h} {prev_token_w}{self.grid_eos_token}{self.bos_token}"
        else:
            expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.bos_token}"

        return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w
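
For concreteness, here is the arithmetic of `_build_prompt_with_target_shape` traced for a hypothetical 1024×768 target (illustrative values, not defaults): the target grid is the requested size in 32-pixel tokens, and the preview grid keeps roughly the same aspect ratio within a budget of about (32 // 2)² = 256 tokens.

import math

factor = 32
height, width = (1024 // factor) * factor, (768 // factor) * factor  # 1024, 768 (already multiples of 32)
token_h, token_w = height // factor, width // factor                 # 32, 24
ratio = token_h / token_w                                            # ~1.333
prev_token_h = int(math.sqrt(ratio) * (factor // 2))                 # int(1.1547... * 16) = 18
prev_token_w = int(math.sqrt(1 / ratio) * (factor // 2))             # int(0.8660... * 16) = 13
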
    @staticmethod
    def _build_target_image_grid_thw(
        token_h: int,
        token_w: int,
        prev_token_h: int,
        prev_token_w: int,
        is_text_to_image: bool = True,
    ):
        if is_text_to_image:
            # Text-to-image: 2 target grids (large + small preview)
            return torch.tensor(
                [
                    [1, token_h, token_w],
                    [1, prev_token_h, prev_token_w],
                ],
            )
        else:
            # Image-to-image: 1 target grid only
            return torch.tensor(
                [
                    [1, token_h, token_w],
                ],
            )
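
Putting the two helpers together: for an image-to-image batch of two samples with one source image each, `__call__` concatenates, per sample, the source grid followed by the single target grid, so `image_grid_thw` ends up with four rows and `images_per_sample` is `[2, 2]`. A small sketch with made-up grid values:

import torch

source_grids = torch.tensor([[1, 16, 24], [1, 20, 20]])  # illustrative source grids, one per sample
target_grid = torch.tensor([[1, 32, 24]])                # illustrative shared target grid

all_grids = []
for i in range(2):                              # batch_size = 2, one source image per sample
    all_grids.append(source_grids[i : i + 1])
    all_grids.append(target_grid)
image_grid_thw = torch.cat(all_grids, dim=0)    # shape (4, 3)
images_per_sample = torch.tensor([2, 2])        # 1 source + 1 target grid per sample
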
__all__ = [
    "GlmImageVQVAEConfig",
    "GlmImageVisionConfig",
    "GlmImageTextConfig",
    "GlmImageConfig",
    "GlmImagePreTrainedModel",
    "GlmImageVQVAE",
    "GlmImageVisionModel",
    "GlmImageTextModel",
    "GlmImageModel",
    "GlmImageForConditionalGeneration",
    "GlmImageImageProcessor",
    "GlmImageImageProcessorFast",
    "GlmImageProcessor",
]
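
Finally, a minimal end-to-end sketch of driving this processor in text-to-image mode. The checkpoint id and prompt are placeholders, and `target_h`/`target_w` are assumed to be routed through `GlmImageProcessorKwargs` (the code above pops them from `images_kwargs`); this is an illustration, not a documented recipe.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("org/glm-image-checkpoint")  # placeholder checkpoint id

# Text-to-image: no source images, so the processor appends the target grid and a
# small preview grid to both the prompt and `image_grid_thw`.
inputs = processor(
    text=["a watercolor fox in the snow"],
    target_h=1024,
    target_w=768,
    return_tensors="pt",
)
print(inputs["input_ids"].shape)   # tokenized prompt with the grid tokens appended
print(inputs["image_grid_thw"])    # tensor([[1, 32, 24], [1, 18, 13]]) for a 1024x768 target
print(inputs["images_per_sample"]) # tensor([2]): 0 source images + 2 target grids
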