transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
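Before walking the per-file changes below, it can help to confirm which side of this comparison an environment actually has installed. The following is a minimal sketch (not part of the diff itself) using only the standard library; the version strings are taken from the wheel filenames above.

```python
# Check the locally installed transformers version against the two
# wheel versions being compared in this diff (5.0.0rc3 vs. 5.1.0).
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("transformers")
except PackageNotFoundError:
    installed = None

print(f"installed transformers: {installed}")

if installed not in {"5.0.0rc3", "5.1.0"}:
    # The environment matches neither side of this comparison, so the
    # file-level changes listed below may not apply verbatim.
    print("note: installed version matches neither side of this diff")
```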
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,428 @@
+# Copyright 2026 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ..glm4v.configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig
+from ..glm4v.modeling_glm4v import (
+    Glm4vForConditionalGeneration,
+    Glm4VisionMlp,
+    Glm4vModel,
+    Glm4vModelOutputWithPast,
+    Glm4vPreTrainedModel,
+    Glm4vRMSNorm,
+    Glm4vTextAttention,
+    Glm4vVisionAttention,
+    Glm4vVisionBlock,
+    Glm4vVisionModel,
+    Glm4vVisionPatchMerger,
+    apply_rotary_pos_emb_vision,
+    eager_attention_forward,
+    is_flash_attention_requested,
+)
+
+
+class GlmOcrRMSNorm(Glm4vRMSNorm):
+    pass
+
+
+class GlmOcrVisionMlp(Glm4VisionMlp):
+    def __init__(self, config, bias: bool = True):
+        super().__init__(config)
+        self.intermediate_size = config.intermediate_size
+
+
+class GlmOcrVisionConfig(Glm4vVisionConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrVisionConfig`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        depth (`int`, *optional*, defaults to 24):
+            Number of layers (depth) in the model.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"silu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for attention weights.
+        num_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer architecture.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        image_size (`int` or `list[int]`, *optional*, defaults to 336):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            The size used for merging spatial dimensions.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The size used for patches along the temporal dimension.
+        out_hidden_size (`int`, *optional*, defaults to 1536):
+            The output hidden size of the vision model.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    def __init__(
+        self,
+        depth=24,
+        hidden_size=1024,
+        hidden_act="silu",
+        attention_bias=True,
+        num_heads=16,
+        image_size=336,
+        out_hidden_size=1536,
+        intermediate_size=4096,
+        **super_kwargs,
+    ):
+        super().__init__(**super_kwargs)
+
+
+class GlmOcrTextConfig(Glm4vTextConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrTextConfig`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 59392):
+            Vocabulary size of the GlmOcr model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GlmOcrModel`]
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 16):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+
+    ```python
+    >>> from transformers import GlmOcrTextModel, GlmOcrConfig
+
+    >>> # Initializing a GLM-OCR style configuration
+    >>> configuration = GlmOcrConfig()
+
+    >>> # Initializing a model from the GLM-OCR style configuration
+    >>> model = GlmOcrTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        vocab_size: int | None = 59392,
+        hidden_size: int | None = 1024,
+        intermediate_size: int | None = 4096,
+        num_hidden_layers: int | None = 16,
+        num_attention_heads: int | None = 16,
+        num_key_value_heads: int | None = 8,
+        max_position_embeddings: int | None = 131072,
+        **super_kwargs,
+    ):
+        super().__init__(**super_kwargs)
+
+
+class GlmOcrConfig(Glm4vConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GlmOcrModel`]. It is used to instantiate a
+    GLM-OCR model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of
+    GLM-OCR [zai-org/GLM-OCR](https://huggingface.co/zai-org/GLM-OCR).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmOcrTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmOcrVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 59280):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 59281):
+            The video token index to encode the image prompt.
+        image_start_token_id (`int`, *optional*, defaults to 59256):
+            The image start token index to encode the start of image.
+        image_end_token_id (`int`, *optional*, defaults to 59257):
+            The image end token index to encode the end of image.
+        video_start_token_id (`int`, *optional*, defaults to 59258):
+            The video start token index to encode the start of video.
+        video_end_token_id (`int`, *optional*, defaults to 59259):
+            The video end token index to encode the end of video.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+
+    ```python
+    >>> from transformers import GlmOcrForConditionalGeneration, GlmOcrConfig
+
+    >>> # Initializing a GLM-OCR style configuration
+    >>> configuration = GlmOcrConfig()
+
+    >>> # Initializing a model from the GLM-OCR style configuration
+    >>> model = GlmOcrForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=59280,
+        video_token_id=59281,
+        image_start_token_id=59256,
+        image_end_token_id=59257,
+        video_start_token_id=59258,
+        video_end_token_id=59259,
+        tie_word_embeddings=False,
+        **super_kwargs,
+    ):
+        super().__init__(**super_kwargs)
+
+
+class GlmOcrTextAttention(Glm4vTextAttention, nn.Module):
+    def __init__(self, config: GlmOcrTextConfig, layer_idx: int | None = None):
+        super().__init__()
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+
+
+class GlmOcrPreTrainedModel(Glm4vPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"model\.language_model\.layers\.16.*"]
+
+
+class GlmOcrModelOutputWithPast(Glm4vModelOutputWithPast):
+    pass
+
+
+class GlmOcrVisionAttention(Glm4vVisionAttention):
+    def __init__(self, config: GlmOcrVisionConfig) -> None:
+        super().__init__()
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
+        self.q_norm = GlmOcrRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = GlmOcrRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor | None = None,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        query_states, key_states, value_states = (
+            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        )
+
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+        query_states = query_states.transpose(0, 1).unsqueeze(0)
+        key_states = key_states.transpose(0, 1).unsqueeze(0)
+        value_states = value_states.transpose(0, 1).unsqueeze(0)
+
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config):
+            # Flash Attention: Use cu_seqlens for variable length attention
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=None,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                cu_seq_lens_q=cu_seqlens,
+                cu_seq_lens_k=cu_seqlens,
+                max_length_q=max_seqlen,
+                max_length_k=max_seqlen,
+                is_causal=False,
+                **kwargs,
+            )
+        else:
+            # Other implementations: Process each chunk separately
+            lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+            splits = [
+                torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+            ]
+
+            attn_outputs = [
+                attention_interface(
+                    self,
+                    q,
+                    k,
+                    v,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    is_causal=False,
+                    **kwargs,
+                )[0]
+                for q, k, v in zip(*splits)
+            ]
+            attn_output = torch.cat(attn_outputs, dim=1)
+
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+
+class GlmOcrVisionBlock(Glm4vVisionBlock):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.mlp = GlmOcrVisionMlp(config, bias=config.attention_bias)
+
+
+class GlmOcrVisionPatchMerger(Glm4vVisionPatchMerger):
+    pass
+
+
+class GlmOcrVisionModel(Glm4vVisionModel):
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        del self.embeddings
+        del self.post_conv_layernorm
+        self.merger = GlmOcrVisionPatchMerger(
+            dim=config.out_hidden_size,
+            context_dim=config.out_hidden_size * config.in_channels,
+            hidden_act=config.hidden_act,
+        )
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
+        r"""
+        hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
+            The final hidden states of the model.
+        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+            The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            `torch.Tensor`: hidden_states.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                position_embeddings=position_embeddings,
+            )
+
+        hidden_states = self.post_layernorm(hidden_states)
+
+        hidden_states = hidden_states.view(
+            -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1]
+        )
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size)
+
+        merged_hidden_states = self.merger(hidden_states)
+        return BaseModelOutputWithPooling(
+            last_hidden_state=hidden_states,
+            pooler_output=merged_hidden_states,
+        )
+
+
+class GlmOcrModel(Glm4vModel):
+    pass
+
+
+class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration):
+    pass
+
+
+__all__ = [
+    "GlmOcrConfig",
+    "GlmOcrTextConfig",
+    "GlmOcrVisionConfig",
+    "GlmOcrTextModel",  # noqa: F822
+    "GlmOcrVisionModel",
+    "GlmOcrModel",
+    "GlmOcrPreTrainedModel",
+    "GlmOcrForConditionalGeneration",
+]
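For orientation, a minimal sketch of how the configuration and model classes added in this hunk compose, following the docstring examples above. It is illustrative only: it builds a randomly initialized GLM-OCR style model, does not refer to any released checkpoint, and assumes the nested-dict handling for `text_config`/`vision_config` inherited from `Glm4vConfig`.

```python
# Illustrative sketch (not part of the diff): compose the new GLM-OCR config and model.
from transformers import GlmOcrConfig, GlmOcrForConditionalGeneration

# Nested dicts are assumed to be promoted to GlmOcrTextConfig / GlmOcrVisionConfig,
# mirroring the Glm4vConfig behaviour these classes inherit.
configuration = GlmOcrConfig(
    text_config={"hidden_size": 1024, "num_hidden_layers": 16},
    vision_config={"hidden_size": 1024, "depth": 24},
)

# Randomly initialized weights; the defaults follow the docstrings in the hunk above.
model = GlmOcrForConditionalGeneration(configuration)
```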
@@ -26,12 +26,12 @@ from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...integrations import use_kernelized_func
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import
+from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring,
-from ...utils.generic import check_model_inputs, maybe_autocast
+from ...utils import TransformersKwargs, auto_docstring, is_torch_available
+from ...utils.generic import can_return_tuple, check_model_inputs, maybe_autocast
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_glmasr import GlmAsrConfig, GlmAsrEncoderConfig
 
@@ -205,9 +205,9 @@ class GlmAsrAttention(nn.Module):
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -292,6 +292,10 @@ class GlmAsrEncoder(GlmAsrPreTrainedModel):
     main_input_name = "input_features"
     input_modalities = "audio"
     _no_split_modules = ["GlmAsrEncoderLayer"]
+    _can_record_outputs = {
+        "hidden_states": GlmAsrEncoderLayer,
+        "attentions": GlmAsrAttention,
+    }
 
     def __init__(self, config: GlmAsrEncoderConfig):
         super().__init__(config)
@@ -322,7 +326,7 @@ class GlmAsrEncoder(GlmAsrPreTrainedModel):
             hidden_states = encoder_layer(hidden_states, position_embeddings=position_embeddings, **kwargs)
 
         hidden_states = self.norm(hidden_states)
-        return
+        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
 
 
 class GlmAsrMultiModalProjector(nn.Module):
@@ -382,26 +386,27 @@ class GlmAsrForConditionalGeneration(GlmAsrPreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.language_model.get_decoder()
 
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="Compute audio embeddings from log-mel input features using the audio encoder and multi-modal projector."
+    )
     def get_audio_features(
-        self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-        Returns:
-            `torch.FloatTensor`:
-                The audio embeddings.
+        self,
+        input_features: torch.FloatTensor,
+        input_features_mask: torch.Tensor,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        input_features (`torch.FloatTensor`):
+            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
+            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
+            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
+            Mask to avoid performing attention on padded feature indices.
         """
-        audio_outputs = self.audio_tower(input_features)
+        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
         audio_hidden_states = audio_outputs.last_hidden_state
         audio_hidden_states = audio_hidden_states.reshape(
             input_features.shape[0], -1, self.config.audio_config.intermediate_size
@@ -415,8 +420,9 @@ class GlmAsrForConditionalGeneration(GlmAsrPreTrainedModel, GenerationMixin):
         post_lengths = (audio_lengths - merge_factor) // merge_factor + 1
 
         valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
-
-
+        audio_outputs.pooler_output = audio_embeds[valid_mask.to(audio_embeds.device)]
+
+        return audio_outputs
 
     @can_return_tuple
     @auto_docstring
@@ -468,7 +474,7 @@ class GlmAsrForConditionalGeneration(GlmAsrPreTrainedModel, GenerationMixin):
         inputs_embeds = self.get_input_embeddings()(input_ids)
 
         if input_features is not None and input_ids is not None:
-            audio_embeds = self.get_audio_features(input_features, input_features_mask)
+            audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
 
             # replace text-audio token placeholders with audio embeddings
             audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
@@ -21,11 +21,11 @@ from ...audio_utils import AudioInput, make_list_of_audio
 from ...cache_utils import Cache
 from ...feature_extraction_utils import BatchFeature
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import
+from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, is_torch_available, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import can_return_tuple, check_model_inputs
 from ..audioflamingo3.modeling_audioflamingo3 import (
     AudioFlamingo3ForConditionalGeneration,
     AudioFlamingo3MultiModalProjector,
@@ -227,9 +227,9 @@ class GlmAsrAttention(LlamaAttention):
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -305,6 +305,10 @@ class GlmAsrEncoder(GlmAsrPreTrainedModel):
     main_input_name = "input_features"
     input_modalities = "audio"
     _no_split_modules = ["GlmAsrEncoderLayer"]
+    _can_record_outputs = {
+        "hidden_states": GlmAsrEncoderLayer,
+        "attentions": GlmAsrAttention,
+    }
 
     def __init__(self, config: GlmAsrEncoderConfig):
         super().__init__(config)
@@ -335,7 +339,7 @@ class GlmAsrEncoder(GlmAsrPreTrainedModel):
             hidden_states = encoder_layer(hidden_states, position_embeddings=position_embeddings, **kwargs)
 
         hidden_states = self.norm(hidden_states)
-        return
+        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
 
 
 class GlmAsrMultiModalProjector(AudioFlamingo3MultiModalProjector):
@@ -351,10 +355,17 @@ class GlmAsrMultiModalProjector(AudioFlamingo3MultiModalProjector):
     """
 )
 class GlmAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration):
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="Compute audio embeddings from log-mel input features using the audio encoder and multi-modal projector."
+    )
     def get_audio_features(
-        self,
-
-
+        self,
+        input_features: torch.FloatTensor,
+        input_features_mask: torch.Tensor,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
         audio_hidden_states = audio_outputs.last_hidden_state
         audio_hidden_states = audio_hidden_states.reshape(
             input_features.shape[0], -1, self.config.audio_config.intermediate_size
@@ -368,8 +379,9 @@ class GlmAsrForConditionalGeneration(AudioFlamingo3ForConditionalGeneration):
         post_lengths = (audio_lengths - merge_factor) // merge_factor + 1
 
         valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
-
-
+        audio_outputs.pooler_output = audio_embeds[valid_mask.to(audio_embeds.device)]
+
+        return audio_outputs
 
     def forward(
         self,
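Both the modeling and modular hunks above change `get_audio_features` to return a `BaseModelOutputWithPooling`, with the un-padded audio embeddings stored in `pooler_output`; this is how `forward` now consumes them. A hedged sketch of that calling convention, where the model, `input_features`, and `input_features_mask` are assumed to come from a GLM-ASR checkpoint and its feature extractor (no specific repo id is implied):

```python
# Sketch of the new get_audio_features contract (assumed usage, not part of the diff).
import torch


def embed_audio(model, input_features: torch.FloatTensor, input_features_mask: torch.Tensor) -> torch.Tensor:
    # With return_dict=True the method returns a BaseModelOutputWithPooling;
    # pooler_output holds the projected embeddings for non-padded frames only,
    # exactly as forward() reads them in the hunks above.
    audio_outputs = model.get_audio_features(input_features, input_features_mask, return_dict=True)
    return audio_outputs.pooler_output
```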
@@ -16,7 +16,7 @@
 from typing import Optional
 
 import torch
-
+import torchvision.transforms.v2.functional as tvF
 
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
@@ -50,7 +50,7 @@ class GLPNImageProcessorFast(BaseImageProcessorFast):
         self,
         image: "torch.Tensor",
         size_divisor: int,
-        interpolation: Optional["
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         **kwargs,
     ) -> "torch.Tensor":
@@ -83,7 +83,7 @@ class GLPNImageProcessorFast(BaseImageProcessorFast):
         images: list["torch.Tensor"],
         do_resize: bool,
         size_divisor: int | None = None,
-        interpolation: Optional["
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         do_rescale: bool = True,
         rescale_factor: float | None = 1 / 255,
         do_normalize: bool = False,
@@ -609,10 +609,12 @@ class GLPNForDepthEstimation(GLPNPreTrainedModel):
         >>> import torch
         >>> import numpy as np
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
         >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
         >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")
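The GLPN docstring example now fetches the test image with `httpx` instead of the previous HTTP helper. Pulled together as a standalone sketch, with the usual depth-estimation tail; `outputs.predicted_depth` is GLPN's standard output field and is assumed here, since it lies outside this hunk:

```python
# Standalone sketch of the updated GLPN example (httpx-based image loading).
from io import BytesIO

import httpx
import torch
from PIL import Image

from transformers import AutoImageProcessor, GLPNForDepthEstimation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with httpx.stream("GET", url) as response:
    image = Image.open(BytesIO(response.read()))

image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Relative depth map, shape (batch, height, width); assumed standard GLPN output.
depth = outputs.predicted_depth
```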